diff --git a/.gitattributes b/.gitattributes
index 0f95a3eb12c46633af28b36dff19e7b9a2f0fab7..450694353054abb5890e37434207b87c07eeb1fc 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -382,3 +382,94 @@ productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_feature
productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4160/tokenizer.json filter=lfs diff=lfs merge=lfs -text
productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-832/tokenizer.json filter=lfs diff=lfs merge=lfs -text
productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1266/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/tokenizer_config.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/trainer_state.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..517d7bc37df454fdc9335a848e9d75c44555034f
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3465/trainer_state.json
@@ -0,0 +1,823 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.0,
+ "eval_steps": 500,
+ "global_step": 3465,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2479030179977415,
+ "epoch": 0.1299545159194282,
+ "grad_norm": 1.519571304321289,
+ "learning_rate": 3.522207847653314e-05,
+ "loss": 2.093206329345703,
+ "mean_token_accuracy": 0.6068353663384914,
+ "num_tokens": 154518.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.932415626347065,
+ "epoch": 0.2599090318388564,
+ "grad_norm": 1.180830955505371,
+ "learning_rate": 7.11629748811588e-05,
+ "loss": 0.8930854797363281,
+ "mean_token_accuracy": 0.7708445385098457,
+ "num_tokens": 306733.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.7730373838543891,
+ "epoch": 0.3898635477582846,
+ "grad_norm": 0.7839977145195007,
+ "learning_rate": 0.00010710387128578447,
+ "loss": 0.7302116394042969,
+ "mean_token_accuracy": 0.8012136635184288,
+ "num_tokens": 446267.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6934178560972214,
+ "epoch": 0.5198180636777128,
+ "grad_norm": 0.666778564453125,
+ "learning_rate": 0.0001430447676904101,
+ "loss": 0.6505754852294922,
+ "mean_token_accuracy": 0.8195212116837501,
+ "num_tokens": 600256.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6900296103954315,
+ "epoch": 0.649772579597141,
+ "grad_norm": 0.6762415766716003,
+ "learning_rate": 0.00017898566409503577,
+ "loss": 0.6378536987304687,
+ "mean_token_accuracy": 0.8223087686300278,
+ "num_tokens": 738649.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.667421719878912,
+ "epoch": 0.7797270955165692,
+ "grad_norm": 0.5047685503959656,
+ "learning_rate": 0.00021492656049966144,
+ "loss": 0.6148524856567383,
+ "mean_token_accuracy": 0.8280292323231697,
+ "num_tokens": 883494.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.6388977643847465,
+ "epoch": 0.9096816114359974,
+ "grad_norm": 0.4360353350639343,
+ "learning_rate": 0.0002508674569042871,
+ "loss": 0.5933729553222656,
+ "mean_token_accuracy": 0.8329134130477905,
+ "num_tokens": 1032111.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6612381511009656,
+ "eval_loss": 0.6559221744537354,
+ "eval_mean_token_accuracy": 0.8195324820967821,
+ "eval_num_tokens": 1132140.0,
+ "eval_runtime": 53.4007,
+ "eval_samples_per_second": 31.03,
+ "eval_steps_per_second": 3.895,
+ "step": 385
+ },
+ {
+ "entropy": 0.6224366770916848,
+ "epoch": 1.0389863547758285,
+ "grad_norm": 0.5294668078422546,
+ "learning_rate": 0.00027673375518355765,
+ "loss": 0.5677951431274414,
+ "mean_token_accuracy": 0.8380465067211708,
+ "num_tokens": 1177556.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5827466724812984,
+ "epoch": 1.1689408706952567,
+ "grad_norm": 0.5172416567802429,
+ "learning_rate": 0.0002765120122346144,
+ "loss": 0.5423126983642578,
+ "mean_token_accuracy": 0.8467991036176682,
+ "num_tokens": 1325434.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5831517253816128,
+ "epoch": 1.2988953866146848,
+ "grad_norm": 0.41916292905807495,
+ "learning_rate": 0.0002760064270819138,
+ "loss": 0.534448013305664,
+ "mean_token_accuracy": 0.8456632816791534,
+ "num_tokens": 1474116.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5869986982643605,
+ "epoch": 1.428849902534113,
+ "grad_norm": 0.4387759566307068,
+ "learning_rate": 0.00027521803857633113,
+ "loss": 0.5367491912841796,
+ "mean_token_accuracy": 0.8462416216731071,
+ "num_tokens": 1621193.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5771756853163242,
+ "epoch": 1.5588044184535412,
+ "grad_norm": 0.49079665541648865,
+ "learning_rate": 0.00027414846665880935,
+ "loss": 0.5238623809814453,
+ "mean_token_accuracy": 0.84760089635849,
+ "num_tokens": 1767789.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5549105909466744,
+ "epoch": 1.6887589343729694,
+ "grad_norm": 0.4000363051891327,
+ "learning_rate": 0.0002727999090317863,
+ "loss": 0.510434226989746,
+ "mean_token_accuracy": 0.8517858856916427,
+ "num_tokens": 1918138.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.583413660377264,
+ "epoch": 1.8187134502923976,
+ "grad_norm": 0.33592426776885986,
+ "learning_rate": 0.00027117513664346674,
+ "loss": 0.5297993850708008,
+ "mean_token_accuracy": 0.846615691781044,
+ "num_tokens": 2057575.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.5732646904885769,
+ "epoch": 1.9486679662118258,
+ "grad_norm": 0.5528839230537415,
+ "learning_rate": 0.00026927748799421714,
+ "loss": 0.5219194793701172,
+ "mean_token_accuracy": 0.8489033079147339,
+ "num_tokens": 2208320.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.6027900550801021,
+ "eval_loss": 0.5946928858757019,
+ "eval_mean_token_accuracy": 0.8318195798649237,
+ "eval_num_tokens": 2264280.0,
+ "eval_runtime": 53.3837,
+ "eval_samples_per_second": 31.039,
+ "eval_steps_per_second": 3.896,
+ "step": 770
+ },
+ {
+ "entropy": 0.5329899657611272,
+ "epoch": 2.077972709551657,
+ "grad_norm": 0.45793575048446655,
+ "learning_rate": 0.0002671108622767842,
+ "loss": 0.48420516967773436,
+ "mean_token_accuracy": 0.8578200301333289,
+ "num_tokens": 2348248.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.5142687204480171,
+ "epoch": 2.207927225471085,
+ "grad_norm": 0.4690960645675659,
+ "learning_rate": 0.0002646797113644295,
+ "loss": 0.4593114471435547,
+ "mean_token_accuracy": 0.8622670090198516,
+ "num_tokens": 2501427.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5135884112119675,
+ "epoch": 2.3378817413905133,
+ "grad_norm": 0.3752821683883667,
+ "learning_rate": 0.00026198903066344565,
+ "loss": 0.4626216125488281,
+ "mean_token_accuracy": 0.8612511262297631,
+ "num_tokens": 2650794.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5137367483973503,
+ "epoch": 2.4678362573099415,
+ "grad_norm": 0.3726271390914917,
+ "learning_rate": 0.0002590443488488465,
+ "loss": 0.4601683807373047,
+ "mean_token_accuracy": 0.8620512077212333,
+ "num_tokens": 2798180.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.5105714881420136,
+ "epoch": 2.5977907732293697,
+ "grad_norm": 0.41296717524528503,
+ "learning_rate": 0.00025585171650432525,
+ "loss": 0.46279102325439453,
+ "mean_token_accuracy": 0.8611763519048691,
+ "num_tokens": 2950301.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5169161760807037,
+ "epoch": 2.727745289148798,
+ "grad_norm": 0.4614253044128418,
+ "learning_rate": 0.0002524176936898197,
+ "loss": 0.45492774963378907,
+ "mean_token_accuracy": 0.8627680170536042,
+ "num_tokens": 3091810.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4989277676492929,
+ "epoch": 2.857699805068226,
+ "grad_norm": 0.37512704730033875,
+ "learning_rate": 0.00024874933646223225,
+ "loss": 0.4531984329223633,
+ "mean_token_accuracy": 0.8637665447592735,
+ "num_tokens": 3242184.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5177617704868317,
+ "epoch": 2.9876543209876543,
+ "grad_norm": 0.3700532019138336,
+ "learning_rate": 0.00024485418237699976,
+ "loss": 0.45844474792480466,
+ "mean_token_accuracy": 0.8626988258957863,
+ "num_tokens": 3382605.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.5253989950108987,
+ "eval_loss": 0.5857328176498413,
+ "eval_mean_token_accuracy": 0.8360884573597175,
+ "eval_num_tokens": 3396420.0,
+ "eval_runtime": 53.341,
+ "eval_samples_per_second": 31.064,
+ "eval_steps_per_second": 3.899,
+ "step": 1155
+ },
+ {
+ "entropy": 0.4535361140517134,
+ "epoch": 3.116959064327485,
+ "grad_norm": 0.3412795662879944,
+ "learning_rate": 0.00024074023500030492,
+ "loss": 0.3942829132080078,
+ "mean_token_accuracy": 0.8781378038564519,
+ "num_tokens": 3522582.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.44747897461056707,
+ "epoch": 3.246913580246914,
+ "grad_norm": 0.46647050976753235,
+ "learning_rate": 0.0002364159474637521,
+ "loss": 0.38986759185791015,
+ "mean_token_accuracy": 0.8777281475067139,
+ "num_tokens": 3670864.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.4480265176296234,
+ "epoch": 3.3768680961663415,
+ "grad_norm": 0.4068582355976105,
+ "learning_rate": 0.00023189020509529866,
+ "loss": 0.39444759368896487,
+ "mean_token_accuracy": 0.8774515727162361,
+ "num_tokens": 3822021.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.45180007234215736,
+ "epoch": 3.50682261208577,
+ "grad_norm": 0.4249928593635559,
+ "learning_rate": 0.00022717230716213122,
+ "loss": 0.3977077102661133,
+ "mean_token_accuracy": 0.8762744688987731,
+ "num_tokens": 3968736.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4614932192862034,
+ "epoch": 3.636777128005198,
+ "grad_norm": 0.561008095741272,
+ "learning_rate": 0.00022227194776300045,
+ "loss": 0.4022808456420898,
+ "mean_token_accuracy": 0.8760285252332687,
+ "num_tokens": 4113509.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4414680179953575,
+ "epoch": 3.7667316439246266,
+ "grad_norm": 0.38943538069725037,
+ "learning_rate": 0.00021719919590927584,
+ "loss": 0.38586376190185545,
+ "mean_token_accuracy": 0.8783121705055237,
+ "num_tokens": 4267958.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.45685607343912127,
+ "epoch": 3.8966861598440543,
+ "grad_norm": 0.5362406969070435,
+ "learning_rate": 0.00021196447483564875,
+ "loss": 0.3983576583862305,
+ "mean_token_accuracy": 0.8764419692754746,
+ "num_tokens": 4415398.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49981127894268584,
+ "eval_loss": 0.5997208952903748,
+ "eval_mean_token_accuracy": 0.8368159819107789,
+ "eval_num_tokens": 4528560.0,
+ "eval_runtime": 53.4304,
+ "eval_samples_per_second": 31.012,
+ "eval_steps_per_second": 3.893,
+ "step": 1540
+ },
+ {
+ "entropy": 0.4439909208060509,
+ "epoch": 4.025990903183885,
+ "grad_norm": 0.5490113496780396,
+ "learning_rate": 0.00020657854058299564,
+ "loss": 0.38307292938232423,
+ "mean_token_accuracy": 0.8795150506436525,
+ "num_tokens": 4559534.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3837250977009535,
+ "epoch": 4.155945419103314,
+ "grad_norm": 0.5567234754562378,
+ "learning_rate": 0.0002010524598974076,
+ "loss": 0.3182963752746582,
+ "mean_token_accuracy": 0.8964017608761787,
+ "num_tokens": 4707075.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.377872094810009,
+ "epoch": 4.2858999350227425,
+ "grad_norm": 0.4315710961818695,
+ "learning_rate": 0.00019539758749079845,
+ "loss": 0.318333683013916,
+ "mean_token_accuracy": 0.8963816618919372,
+ "num_tokens": 4851683.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.38739304527640345,
+ "epoch": 4.41585445094217,
+ "grad_norm": 0.49140632152557373,
+ "learning_rate": 0.00018962554270981555,
+ "loss": 0.32688804626464846,
+ "mean_token_accuracy": 0.8937860554456711,
+ "num_tokens": 4994086.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.39157475270330905,
+ "epoch": 4.545808966861598,
+ "grad_norm": 0.40667369961738586,
+ "learning_rate": 0.00018374818566099208,
+ "loss": 0.3305763626098633,
+ "mean_token_accuracy": 0.8916732975840569,
+ "num_tokens": 5137171.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.3838599680364132,
+ "epoch": 4.675763482781027,
+ "grad_norm": 0.4632417857646942,
+ "learning_rate": 0.0001777775928411983,
+ "loss": 0.3267818450927734,
+ "mean_token_accuracy": 0.8946500706672669,
+ "num_tokens": 5287076.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.38270787581801413,
+ "epoch": 4.805717998700455,
+ "grad_norm": 0.5529720187187195,
+ "learning_rate": 0.0001717260323234649,
+ "loss": 0.3264235305786133,
+ "mean_token_accuracy": 0.8948800846934318,
+ "num_tokens": 5436923.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.38736109718680384,
+ "epoch": 4.935672514619883,
+ "grad_norm": 0.5604785680770874,
+ "learning_rate": 0.00016560593854916497,
+ "loss": 0.3280513381958008,
+ "mean_token_accuracy": 0.8931388029456139,
+ "num_tokens": 5589195.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.4370518812479881,
+ "eval_loss": 0.6210553050041199,
+ "eval_mean_token_accuracy": 0.8379779781859654,
+ "eval_num_tokens": 5660700.0,
+ "eval_runtime": 53.4039,
+ "eval_samples_per_second": 31.028,
+ "eval_steps_per_second": 3.895,
+ "step": 1925
+ },
+ {
+ "entropy": 0.3330705868988181,
+ "epoch": 5.064977257959714,
+ "grad_norm": 0.5386723875999451,
+ "learning_rate": 0.0001594298867783512,
+ "loss": 0.2754818344116211,
+ "mean_token_accuracy": 0.9101201346771202,
+ "num_tokens": 5739445.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.2980620255321264,
+ "epoch": 5.1949317738791425,
+ "grad_norm": 0.5633581876754761,
+ "learning_rate": 0.00015321056725074549,
+ "loss": 0.23754241943359375,
+ "mean_token_accuracy": 0.9203532826900482,
+ "num_tokens": 5888043.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.3111592583358288,
+ "epoch": 5.32488628979857,
+ "grad_norm": 0.5031015872955322,
+ "learning_rate": 0.0001469607591104745,
+ "loss": 0.24428102493286133,
+ "mean_token_accuracy": 0.917181601524353,
+ "num_tokens": 6031284.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.31522042460739613,
+ "epoch": 5.454840805717999,
+ "grad_norm": 0.6432453393936157,
+ "learning_rate": 0.0001406933041481286,
+ "loss": 0.25112478256225584,
+ "mean_token_accuracy": 0.9152472382783889,
+ "num_tokens": 6179927.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3046229027956724,
+ "epoch": 5.584795321637427,
+ "grad_norm": 0.5104537606239319,
+ "learning_rate": 0.00013442108041409814,
+ "loss": 0.2431495475769043,
+ "mean_token_accuracy": 0.917829519212246,
+ "num_tokens": 6322630.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.30440517760813235,
+ "epoch": 5.714749837556855,
+ "grad_norm": 0.5307765603065491,
+ "learning_rate": 0.0001281569757574053,
+ "loss": 0.24610313415527343,
+ "mean_token_accuracy": 0.9166415151953697,
+ "num_tokens": 6469843.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.304359400421381,
+ "epoch": 5.844704353476283,
+ "grad_norm": 0.5014523267745972,
+ "learning_rate": 0.00012191386134440133,
+ "loss": 0.24548973083496095,
+ "mean_token_accuracy": 0.9165477308630944,
+ "num_tokens": 6617768.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.3141418205201626,
+ "epoch": 5.974658869395712,
+ "grad_norm": 0.567398726940155,
+ "learning_rate": 0.00011570456521174339,
+ "loss": 0.24975168228149414,
+ "mean_token_accuracy": 0.9139353120326996,
+ "num_tokens": 6761187.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.3758909202252443,
+ "eval_loss": 0.6843022108078003,
+ "eval_mean_token_accuracy": 0.8348075449466705,
+ "eval_num_tokens": 6792840.0,
+ "eval_runtime": 53.3825,
+ "eval_samples_per_second": 31.04,
+ "eval_steps_per_second": 3.896,
+ "step": 2310
+ },
+ {
+ "entropy": 0.24346149432000203,
+ "epoch": 6.1039636127355426,
+ "grad_norm": 0.7140825986862183,
+ "learning_rate": 0.00010954184590799172,
+ "loss": 0.17231273651123047,
+ "mean_token_accuracy": 0.9407721275660261,
+ "num_tokens": 6909578.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.2160973483324051,
+ "epoch": 6.23391812865497,
+ "grad_norm": 0.49014952778816223,
+ "learning_rate": 0.00010343836627798716,
+ "loss": 0.15455107688903807,
+ "mean_token_accuracy": 0.9467655989527702,
+ "num_tokens": 7056244.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.21491924367845058,
+ "epoch": 6.363872644574399,
+ "grad_norm": 0.5529471635818481,
+ "learning_rate": 9.740666744387656e-05,
+ "loss": 0.1584029197692871,
+ "mean_token_accuracy": 0.9460993978381157,
+ "num_tokens": 7206950.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.22037068914622068,
+ "epoch": 6.493827160493828,
+ "grad_norm": 0.6232843995094299,
+ "learning_rate": 9.145914303624717e-05,
+ "loss": 0.15544342041015624,
+ "mean_token_accuracy": 0.9450622496008872,
+ "num_tokens": 7359429.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.2320463878661394,
+ "epoch": 6.623781676413255,
+ "grad_norm": 0.7459681630134583,
+ "learning_rate": 8.560801372831975e-05,
+ "loss": 0.16350215911865235,
+ "mean_token_accuracy": 0.9416968420147895,
+ "num_tokens": 7499281.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.22943626195192338,
+ "epoch": 6.753736192332683,
+ "grad_norm": 0.7482302784919739,
+ "learning_rate": 7.986530212552506e-05,
+ "loss": 0.16422538757324218,
+ "mean_token_accuracy": 0.9434959614276885,
+ "num_tokens": 7640758.0,
+ "step": 2600
+ },
+ {
+ "entropy": 0.21795938543975354,
+ "epoch": 6.883690708252112,
+ "grad_norm": 0.5210486054420471,
+ "learning_rate": 7.424280806206118e-05,
+ "loss": 0.15540474891662598,
+ "mean_token_accuracy": 0.9459306105971337,
+ "num_tokens": 7791986.0,
+ "step": 2650
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.3157534837149657,
+ "eval_loss": 0.7744874954223633,
+ "eval_mean_token_accuracy": 0.8346395036922052,
+ "eval_num_tokens": 7924980.0,
+ "eval_runtime": 53.3771,
+ "eval_samples_per_second": 31.043,
+ "eval_steps_per_second": 3.897,
+ "step": 2695
+ },
+ {
+ "entropy": 0.2124774060656677,
+ "epoch": 7.012995451591943,
+ "grad_norm": 0.42238789796829224,
+ "learning_rate": 6.875208435518865e-05,
+ "loss": 0.14792531967163086,
+ "mean_token_accuracy": 0.9490461115861059,
+ "num_tokens": 7940521.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.15692965138703585,
+ "epoch": 7.142949967511371,
+ "grad_norm": 0.4711572229862213,
+ "learning_rate": 6.340441306708468e-05,
+ "loss": 0.09051708221435546,
+ "mean_token_accuracy": 0.9700166273117066,
+ "num_tokens": 8084193.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.15241683423519134,
+ "epoch": 7.272904483430799,
+ "grad_norm": 0.4312196671962738,
+ "learning_rate": 5.821078232303016e-05,
+ "loss": 0.08812363624572754,
+ "mean_token_accuracy": 0.9699361199140548,
+ "num_tokens": 8230159.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.1459079357981682,
+ "epoch": 7.402858999350228,
+ "grad_norm": 0.4804084002971649,
+ "learning_rate": 5.3181863733564636e-05,
+ "loss": 0.08675944328308105,
+ "mean_token_accuracy": 0.9703072866797448,
+ "num_tokens": 8380556.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.15621621005237102,
+ "epoch": 7.532813515269655,
+ "grad_norm": 0.5435478091239929,
+ "learning_rate": 4.83279904669986e-05,
+ "loss": 0.09016354560852051,
+ "mean_token_accuracy": 0.9674961140751839,
+ "num_tokens": 8523248.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.1545175113901496,
+ "epoch": 7.662768031189084,
+ "grad_norm": 0.524286687374115,
+ "learning_rate": 4.365913601734056e-05,
+ "loss": 0.09049141883850098,
+ "mean_token_accuracy": 0.9679373624920845,
+ "num_tokens": 8672002.0,
+ "step": 2950
+ },
+ {
+ "entropy": 0.1553485019877553,
+ "epoch": 7.792722547108512,
+ "grad_norm": 0.5006484389305115,
+ "learning_rate": 3.9184893711264495e-05,
+ "loss": 0.08913107872009278,
+ "mean_token_accuracy": 0.9684559822082519,
+ "num_tokens": 8816090.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.15444331549108029,
+ "epoch": 7.92267706302794,
+ "grad_norm": 0.5613893866539001,
+ "learning_rate": 3.491445699622611e-05,
+ "loss": 0.08711207389831543,
+ "mean_token_accuracy": 0.9684525722265244,
+ "num_tokens": 8966004.0,
+ "step": 3050
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.26580205430778175,
+ "eval_loss": 0.8996392488479614,
+ "eval_mean_token_accuracy": 0.8319417791297803,
+ "eval_num_tokens": 9057120.0,
+ "eval_runtime": 53.4042,
+ "eval_samples_per_second": 31.028,
+ "eval_steps_per_second": 3.895,
+ "step": 3080
+ },
+ {
+ "entropy": 0.14060429509860187,
+ "epoch": 8.05198180636777,
+ "grad_norm": 0.25378674268722534,
+ "learning_rate": 3.085660055023035e-05,
+ "loss": 0.07468742847442628,
+ "mean_token_accuracy": 0.9737296260181983,
+ "num_tokens": 9115929.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.12152639016509056,
+ "epoch": 8.1819363222872,
+ "grad_norm": 0.3634810447692871,
+ "learning_rate": 2.7019662252065798e-05,
+ "loss": 0.05918361663818359,
+ "mean_token_accuracy": 0.9785374769568443,
+ "num_tokens": 9260646.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.1228457903675735,
+ "epoch": 8.311890838206628,
+ "grad_norm": 0.32873299717903137,
+ "learning_rate": 2.3411526049051643e-05,
+ "loss": 0.060924801826477054,
+ "mean_token_accuracy": 0.9778030979633331,
+ "num_tokens": 9404737.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.11790491977706552,
+ "epoch": 8.441845354126055,
+ "grad_norm": 0.27871423959732056,
+ "learning_rate": 2.0039605757500512e-05,
+ "loss": 0.05871880531311035,
+ "mean_token_accuracy": 0.9786837643384934,
+ "num_tokens": 9552241.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.12911307733505964,
+ "epoch": 8.571799870045485,
+ "grad_norm": 0.4569152593612671,
+ "learning_rate": 1.691082982918235e-05,
+ "loss": 0.06450970649719238,
+ "mean_token_accuracy": 0.9761316785216332,
+ "num_tokens": 9689407.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.11636774389073253,
+ "epoch": 8.701754385964913,
+ "grad_norm": 0.27477338910102844,
+ "learning_rate": 1.403162711509129e-05,
+ "loss": 0.05784036159515381,
+ "mean_token_accuracy": 0.9791285961866378,
+ "num_tokens": 9842204.0,
+ "step": 3350
+ },
+ {
+ "entropy": 0.11710284009575844,
+ "epoch": 8.83170890188434,
+ "grad_norm": 0.26900890469551086,
+ "learning_rate": 1.1407913655766755e-05,
+ "loss": 0.05737146377563476,
+ "mean_token_accuracy": 0.9788037702441216,
+ "num_tokens": 9994524.0,
+ "step": 3400
+ },
+ {
+ "entropy": 0.11474769543856382,
+ "epoch": 8.961663417803768,
+ "grad_norm": 0.3137633204460144,
+ "learning_rate": 9.045080525311815e-06,
+ "loss": 0.057830405235290525,
+ "mean_token_accuracy": 0.9789029136300087,
+ "num_tokens": 10147597.0,
+ "step": 3450
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.23872991701444754,
+ "eval_loss": 1.007972002029419,
+ "eval_mean_token_accuracy": 0.8318104110658169,
+ "eval_num_tokens": 10189260.0,
+ "eval_runtime": 53.4047,
+ "eval_samples_per_second": 31.027,
+ "eval_steps_per_second": 3.895,
+ "step": 3465
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.43404829763072e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/README.md b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/adapter_config.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7b923a301af4113e0aa591d097678b1fa73025c
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.009078376988692594,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "o_proj",
+ "k_proj",
+ "q_proj",
+ "gate_proj",
+ "up_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/chat_template.jinja b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tool mode: the system turn carries any leading system message plus the JSON tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}{#- No tools: only render a leading system message, if there is one. -#}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- Scan backwards for the last user turn that is not a wrapped tool response. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}{#- Main rendering pass, one chat turn per message. -#}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}{#- Otherwise split inline reasoning from the answer at the closing think tag. -#}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is only re-emitted for assistant turns after the last user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}{#- Each tool call is serialized as a JSON object inside tool_call tags. -#}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/tokenizer_config.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/trainer_state.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7047e60f094ad7d822bdcb7285421f0dbe913f2
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-385/trainer_state.json
@@ -0,0 +1,115 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 500,
+ "global_step": 385,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2479030179977415,
+ "epoch": 0.1299545159194282,
+ "grad_norm": 1.519571304321289,
+ "learning_rate": 3.522207847653314e-05,
+ "loss": 2.093206329345703,
+ "mean_token_accuracy": 0.6068353663384914,
+ "num_tokens": 154518.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.932415626347065,
+ "epoch": 0.2599090318388564,
+ "grad_norm": 1.180830955505371,
+ "learning_rate": 7.11629748811588e-05,
+ "loss": 0.8930854797363281,
+ "mean_token_accuracy": 0.7708445385098457,
+ "num_tokens": 306733.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.7730373838543891,
+ "epoch": 0.3898635477582846,
+ "grad_norm": 0.7839977145195007,
+ "learning_rate": 0.00010710387128578447,
+ "loss": 0.7302116394042969,
+ "mean_token_accuracy": 0.8012136635184288,
+ "num_tokens": 446267.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6934178560972214,
+ "epoch": 0.5198180636777128,
+ "grad_norm": 0.666778564453125,
+ "learning_rate": 0.0001430447676904101,
+ "loss": 0.6505754852294922,
+ "mean_token_accuracy": 0.8195212116837501,
+ "num_tokens": 600256.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6900296103954315,
+ "epoch": 0.649772579597141,
+ "grad_norm": 0.6762415766716003,
+ "learning_rate": 0.00017898566409503577,
+ "loss": 0.6378536987304687,
+ "mean_token_accuracy": 0.8223087686300278,
+ "num_tokens": 738649.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.667421719878912,
+ "epoch": 0.7797270955165692,
+ "grad_norm": 0.5047685503959656,
+ "learning_rate": 0.00021492656049966144,
+ "loss": 0.6148524856567383,
+ "mean_token_accuracy": 0.8280292323231697,
+ "num_tokens": 883494.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.6388977643847465,
+ "epoch": 0.9096816114359974,
+ "grad_norm": 0.4360353350639343,
+ "learning_rate": 0.0002508674569042871,
+ "loss": 0.5933729553222656,
+ "mean_token_accuracy": 0.8329134130477905,
+ "num_tokens": 1032111.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6612381511009656,
+ "eval_loss": 0.6559221744537354,
+ "eval_mean_token_accuracy": 0.8195324820967821,
+ "eval_num_tokens": 1132140.0,
+ "eval_runtime": 53.4007,
+ "eval_samples_per_second": 31.03,
+ "eval_steps_per_second": 3.895,
+ "step": 385
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.91184805036032e+16,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/README.md b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/adapter_config.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7b923a301af4113e0aa591d097678b1fa73025c
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.009078376988692594,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "o_proj",
+ "k_proj",
+ "q_proj",
+ "gate_proj",
+ "up_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/chat_template.jinja b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/chat_template.jinja
@@ -0,0 +1,91 @@
+{#- Qwen3 ChatML chat template: renders `messages` (and optional `tools`) as <|im_start|>role ... <|im_end|> turns; also reads `add_generation_prompt` and `enable_thinking`. -#}
+{#- System turn: when tools are given, their JSON signatures are appended after the optional leading system message. -#}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Scan backwards for the last user message that is not a wrapped tool response; only assistant turns after that index keep their think block. -#}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Use message.reasoning_content when provided; otherwise split the reasoning out of the content at the closing </think> tag. -#}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages share a single user turn; each result is wrapped in <tool_response> tags. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Generation prompt: when enable_thinking is explicitly false, pre-fill an empty think block. -#}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/tokenizer_config.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/trainer_state.json b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7bcc12b96d5ae1ddf5b12c429240bd6898758939
--- /dev/null
+++ b/DBCA_code_Estonian/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-3850/trainer_state.json
@@ -0,0 +1,914 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 10.0,
+ "eval_steps": 500,
+ "global_step": 3850,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2479030179977415,
+ "epoch": 0.1299545159194282,
+ "grad_norm": 1.519571304321289,
+ "learning_rate": 3.522207847653314e-05,
+ "loss": 2.093206329345703,
+ "mean_token_accuracy": 0.6068353663384914,
+ "num_tokens": 154518.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.932415626347065,
+ "epoch": 0.2599090318388564,
+ "grad_norm": 1.180830955505371,
+ "learning_rate": 7.11629748811588e-05,
+ "loss": 0.8930854797363281,
+ "mean_token_accuracy": 0.7708445385098457,
+ "num_tokens": 306733.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.7730373838543891,
+ "epoch": 0.3898635477582846,
+ "grad_norm": 0.7839977145195007,
+ "learning_rate": 0.00010710387128578447,
+ "loss": 0.7302116394042969,
+ "mean_token_accuracy": 0.8012136635184288,
+ "num_tokens": 446267.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6934178560972214,
+ "epoch": 0.5198180636777128,
+ "grad_norm": 0.666778564453125,
+ "learning_rate": 0.0001430447676904101,
+ "loss": 0.6505754852294922,
+ "mean_token_accuracy": 0.8195212116837501,
+ "num_tokens": 600256.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6900296103954315,
+ "epoch": 0.649772579597141,
+ "grad_norm": 0.6762415766716003,
+ "learning_rate": 0.00017898566409503577,
+ "loss": 0.6378536987304687,
+ "mean_token_accuracy": 0.8223087686300278,
+ "num_tokens": 738649.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.667421719878912,
+ "epoch": 0.7797270955165692,
+ "grad_norm": 0.5047685503959656,
+ "learning_rate": 0.00021492656049966144,
+ "loss": 0.6148524856567383,
+ "mean_token_accuracy": 0.8280292323231697,
+ "num_tokens": 883494.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.6388977643847465,
+ "epoch": 0.9096816114359974,
+ "grad_norm": 0.4360353350639343,
+ "learning_rate": 0.0002508674569042871,
+ "loss": 0.5933729553222656,
+ "mean_token_accuracy": 0.8329134130477905,
+ "num_tokens": 1032111.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6612381511009656,
+ "eval_loss": 0.6559221744537354,
+ "eval_mean_token_accuracy": 0.8195324820967821,
+ "eval_num_tokens": 1132140.0,
+ "eval_runtime": 53.4007,
+ "eval_samples_per_second": 31.03,
+ "eval_steps_per_second": 3.895,
+ "step": 385
+ },
+ {
+ "entropy": 0.6224366770916848,
+ "epoch": 1.0389863547758285,
+ "grad_norm": 0.5294668078422546,
+ "learning_rate": 0.00027673375518355765,
+ "loss": 0.5677951431274414,
+ "mean_token_accuracy": 0.8380465067211708,
+ "num_tokens": 1177556.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5827466724812984,
+ "epoch": 1.1689408706952567,
+ "grad_norm": 0.5172416567802429,
+ "learning_rate": 0.0002765120122346144,
+ "loss": 0.5423126983642578,
+ "mean_token_accuracy": 0.8467991036176682,
+ "num_tokens": 1325434.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5831517253816128,
+ "epoch": 1.2988953866146848,
+ "grad_norm": 0.41916292905807495,
+ "learning_rate": 0.0002760064270819138,
+ "loss": 0.534448013305664,
+ "mean_token_accuracy": 0.8456632816791534,
+ "num_tokens": 1474116.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5869986982643605,
+ "epoch": 1.428849902534113,
+ "grad_norm": 0.4387759566307068,
+ "learning_rate": 0.00027521803857633113,
+ "loss": 0.5367491912841796,
+ "mean_token_accuracy": 0.8462416216731071,
+ "num_tokens": 1621193.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5771756853163242,
+ "epoch": 1.5588044184535412,
+ "grad_norm": 0.49079665541648865,
+ "learning_rate": 0.00027414846665880935,
+ "loss": 0.5238623809814453,
+ "mean_token_accuracy": 0.84760089635849,
+ "num_tokens": 1767789.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5549105909466744,
+ "epoch": 1.6887589343729694,
+ "grad_norm": 0.4000363051891327,
+ "learning_rate": 0.0002727999090317863,
+ "loss": 0.510434226989746,
+ "mean_token_accuracy": 0.8517858856916427,
+ "num_tokens": 1918138.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.583413660377264,
+ "epoch": 1.8187134502923976,
+ "grad_norm": 0.33592426776885986,
+ "learning_rate": 0.00027117513664346674,
+ "loss": 0.5297993850708008,
+ "mean_token_accuracy": 0.846615691781044,
+ "num_tokens": 2057575.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.5732646904885769,
+ "epoch": 1.9486679662118258,
+ "grad_norm": 0.5528839230537415,
+ "learning_rate": 0.00026927748799421714,
+ "loss": 0.5219194793701172,
+ "mean_token_accuracy": 0.8489033079147339,
+ "num_tokens": 2208320.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.6027900550801021,
+ "eval_loss": 0.5946928858757019,
+ "eval_mean_token_accuracy": 0.8318195798649237,
+ "eval_num_tokens": 2264280.0,
+ "eval_runtime": 53.3837,
+ "eval_samples_per_second": 31.039,
+ "eval_steps_per_second": 3.896,
+ "step": 770
+ },
+ {
+ "entropy": 0.5329899657611272,
+ "epoch": 2.077972709551657,
+ "grad_norm": 0.45793575048446655,
+ "learning_rate": 0.0002671108622767842,
+ "loss": 0.48420516967773436,
+ "mean_token_accuracy": 0.8578200301333289,
+ "num_tokens": 2348248.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.5142687204480171,
+ "epoch": 2.207927225471085,
+ "grad_norm": 0.4690960645675659,
+ "learning_rate": 0.0002646797113644295,
+ "loss": 0.4593114471435547,
+ "mean_token_accuracy": 0.8622670090198516,
+ "num_tokens": 2501427.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5135884112119675,
+ "epoch": 2.3378817413905133,
+ "grad_norm": 0.3752821683883667,
+ "learning_rate": 0.00026198903066344565,
+ "loss": 0.4626216125488281,
+ "mean_token_accuracy": 0.8612511262297631,
+ "num_tokens": 2650794.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5137367483973503,
+ "epoch": 2.4678362573099415,
+ "grad_norm": 0.3726271390914917,
+ "learning_rate": 0.0002590443488488465,
+ "loss": 0.4601683807373047,
+ "mean_token_accuracy": 0.8620512077212333,
+ "num_tokens": 2798180.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.5105714881420136,
+ "epoch": 2.5977907732293697,
+ "grad_norm": 0.41296717524528503,
+ "learning_rate": 0.00025585171650432525,
+ "loss": 0.46279102325439453,
+ "mean_token_accuracy": 0.8611763519048691,
+ "num_tokens": 2950301.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5169161760807037,
+ "epoch": 2.727745289148798,
+ "grad_norm": 0.4614253044128418,
+ "learning_rate": 0.0002524176936898197,
+ "loss": 0.45492774963378907,
+ "mean_token_accuracy": 0.8627680170536042,
+ "num_tokens": 3091810.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4989277676492929,
+ "epoch": 2.857699805068226,
+ "grad_norm": 0.37512704730033875,
+ "learning_rate": 0.00024874933646223225,
+ "loss": 0.4531984329223633,
+ "mean_token_accuracy": 0.8637665447592735,
+ "num_tokens": 3242184.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5177617704868317,
+ "epoch": 2.9876543209876543,
+ "grad_norm": 0.3700532019138336,
+ "learning_rate": 0.00024485418237699976,
+ "loss": 0.45844474792480466,
+ "mean_token_accuracy": 0.8626988258957863,
+ "num_tokens": 3382605.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.5253989950108987,
+ "eval_loss": 0.5857328176498413,
+ "eval_mean_token_accuracy": 0.8360884573597175,
+ "eval_num_tokens": 3396420.0,
+ "eval_runtime": 53.341,
+ "eval_samples_per_second": 31.064,
+ "eval_steps_per_second": 3.899,
+ "step": 1155
+ },
+ {
+ "entropy": 0.4535361140517134,
+ "epoch": 3.116959064327485,
+ "grad_norm": 0.3412795662879944,
+ "learning_rate": 0.00024074023500030492,
+ "loss": 0.3942829132080078,
+ "mean_token_accuracy": 0.8781378038564519,
+ "num_tokens": 3522582.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.44747897461056707,
+ "epoch": 3.246913580246914,
+ "grad_norm": 0.46647050976753235,
+ "learning_rate": 0.0002364159474637521,
+ "loss": 0.38986759185791015,
+ "mean_token_accuracy": 0.8777281475067139,
+ "num_tokens": 3670864.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.4480265176296234,
+ "epoch": 3.3768680961663415,
+ "grad_norm": 0.4068582355976105,
+ "learning_rate": 0.00023189020509529866,
+ "loss": 0.39444759368896487,
+ "mean_token_accuracy": 0.8774515727162361,
+ "num_tokens": 3822021.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.45180007234215736,
+ "epoch": 3.50682261208577,
+ "grad_norm": 0.4249928593635559,
+ "learning_rate": 0.00022717230716213122,
+ "loss": 0.3977077102661133,
+ "mean_token_accuracy": 0.8762744688987731,
+ "num_tokens": 3968736.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4614932192862034,
+ "epoch": 3.636777128005198,
+ "grad_norm": 0.561008095741272,
+ "learning_rate": 0.00022227194776300045,
+ "loss": 0.4022808456420898,
+ "mean_token_accuracy": 0.8760285252332687,
+ "num_tokens": 4113509.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4414680179953575,
+ "epoch": 3.7667316439246266,
+ "grad_norm": 0.38943538069725037,
+ "learning_rate": 0.00021719919590927584,
+ "loss": 0.38586376190185545,
+ "mean_token_accuracy": 0.8783121705055237,
+ "num_tokens": 4267958.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.45685607343912127,
+ "epoch": 3.8966861598440543,
+ "grad_norm": 0.5362406969070435,
+ "learning_rate": 0.00021196447483564875,
+ "loss": 0.3983576583862305,
+ "mean_token_accuracy": 0.8764419692754746,
+ "num_tokens": 4415398.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49981127894268584,
+ "eval_loss": 0.5997208952903748,
+ "eval_mean_token_accuracy": 0.8368159819107789,
+ "eval_num_tokens": 4528560.0,
+ "eval_runtime": 53.4304,
+ "eval_samples_per_second": 31.012,
+ "eval_steps_per_second": 3.893,
+ "step": 1540
+ },
+ {
+ "entropy": 0.4439909208060509,
+ "epoch": 4.025990903183885,
+ "grad_norm": 0.5490113496780396,
+ "learning_rate": 0.00020657854058299564,
+ "loss": 0.38307292938232423,
+ "mean_token_accuracy": 0.8795150506436525,
+ "num_tokens": 4559534.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3837250977009535,
+ "epoch": 4.155945419103314,
+ "grad_norm": 0.5567234754562378,
+ "learning_rate": 0.0002010524598974076,
+ "loss": 0.3182963752746582,
+ "mean_token_accuracy": 0.8964017608761787,
+ "num_tokens": 4707075.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.377872094810009,
+ "epoch": 4.2858999350227425,
+ "grad_norm": 0.4315710961818695,
+ "learning_rate": 0.00019539758749079845,
+ "loss": 0.318333683013916,
+ "mean_token_accuracy": 0.8963816618919372,
+ "num_tokens": 4851683.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.38739304527640345,
+ "epoch": 4.41585445094217,
+ "grad_norm": 0.49140632152557373,
+ "learning_rate": 0.00018962554270981555,
+ "loss": 0.32688804626464846,
+ "mean_token_accuracy": 0.8937860554456711,
+ "num_tokens": 4994086.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.39157475270330905,
+ "epoch": 4.545808966861598,
+ "grad_norm": 0.40667369961738586,
+ "learning_rate": 0.00018374818566099208,
+ "loss": 0.3305763626098633,
+ "mean_token_accuracy": 0.8916732975840569,
+ "num_tokens": 5137171.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.3838599680364132,
+ "epoch": 4.675763482781027,
+ "grad_norm": 0.4632417857646942,
+ "learning_rate": 0.0001777775928411983,
+ "loss": 0.3267818450927734,
+ "mean_token_accuracy": 0.8946500706672669,
+ "num_tokens": 5287076.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.38270787581801413,
+ "epoch": 4.805717998700455,
+ "grad_norm": 0.5529720187187195,
+ "learning_rate": 0.0001717260323234649,
+ "loss": 0.3264235305786133,
+ "mean_token_accuracy": 0.8948800846934318,
+ "num_tokens": 5436923.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.38736109718680384,
+ "epoch": 4.935672514619883,
+ "grad_norm": 0.5604785680770874,
+ "learning_rate": 0.00016560593854916497,
+ "loss": 0.3280513381958008,
+ "mean_token_accuracy": 0.8931388029456139,
+ "num_tokens": 5589195.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.4370518812479881,
+ "eval_loss": 0.6210553050041199,
+ "eval_mean_token_accuracy": 0.8379779781859654,
+ "eval_num_tokens": 5660700.0,
+ "eval_runtime": 53.4039,
+ "eval_samples_per_second": 31.028,
+ "eval_steps_per_second": 3.895,
+ "step": 1925
+ },
+ {
+ "entropy": 0.3330705868988181,
+ "epoch": 5.064977257959714,
+ "grad_norm": 0.5386723875999451,
+ "learning_rate": 0.0001594298867783512,
+ "loss": 0.2754818344116211,
+ "mean_token_accuracy": 0.9101201346771202,
+ "num_tokens": 5739445.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.2980620255321264,
+ "epoch": 5.1949317738791425,
+ "grad_norm": 0.5633581876754761,
+ "learning_rate": 0.00015321056725074549,
+ "loss": 0.23754241943359375,
+ "mean_token_accuracy": 0.9203532826900482,
+ "num_tokens": 5888043.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.3111592583358288,
+ "epoch": 5.32488628979857,
+ "grad_norm": 0.5031015872955322,
+ "learning_rate": 0.0001469607591104745,
+ "loss": 0.24428102493286133,
+ "mean_token_accuracy": 0.917181601524353,
+ "num_tokens": 6031284.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.31522042460739613,
+ "epoch": 5.454840805717999,
+ "grad_norm": 0.6432453393936157,
+ "learning_rate": 0.0001406933041481286,
+ "loss": 0.25112478256225584,
+ "mean_token_accuracy": 0.9152472382783889,
+ "num_tokens": 6179927.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3046229027956724,
+ "epoch": 5.584795321637427,
+ "grad_norm": 0.5104537606239319,
+ "learning_rate": 0.00013442108041409814,
+ "loss": 0.2431495475769043,
+ "mean_token_accuracy": 0.917829519212246,
+ "num_tokens": 6322630.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.30440517760813235,
+ "epoch": 5.714749837556855,
+ "grad_norm": 0.5307765603065491,
+ "learning_rate": 0.0001281569757574053,
+ "loss": 0.24610313415527343,
+ "mean_token_accuracy": 0.9166415151953697,
+ "num_tokens": 6469843.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.304359400421381,
+ "epoch": 5.844704353476283,
+ "grad_norm": 0.5014523267745972,
+ "learning_rate": 0.00012191386134440133,
+ "loss": 0.24548973083496095,
+ "mean_token_accuracy": 0.9165477308630944,
+ "num_tokens": 6617768.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.3141418205201626,
+ "epoch": 5.974658869395712,
+ "grad_norm": 0.567398726940155,
+ "learning_rate": 0.00011570456521174339,
+ "loss": 0.24975168228149414,
+ "mean_token_accuracy": 0.9139353120326996,
+ "num_tokens": 6761187.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.3758909202252443,
+ "eval_loss": 0.6843022108078003,
+ "eval_mean_token_accuracy": 0.8348075449466705,
+ "eval_num_tokens": 6792840.0,
+ "eval_runtime": 53.3825,
+ "eval_samples_per_second": 31.04,
+ "eval_steps_per_second": 3.896,
+ "step": 2310
+ },
+ {
+ "entropy": 0.24346149432000203,
+ "epoch": 6.1039636127355426,
+ "grad_norm": 0.7140825986862183,
+ "learning_rate": 0.00010954184590799172,
+ "loss": 0.17231273651123047,
+ "mean_token_accuracy": 0.9407721275660261,
+ "num_tokens": 6909578.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.2160973483324051,
+ "epoch": 6.23391812865497,
+ "grad_norm": 0.49014952778816223,
+ "learning_rate": 0.00010343836627798716,
+ "loss": 0.15455107688903807,
+ "mean_token_accuracy": 0.9467655989527702,
+ "num_tokens": 7056244.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.21491924367845058,
+ "epoch": 6.363872644574399,
+ "grad_norm": 0.5529471635818481,
+ "learning_rate": 9.740666744387656e-05,
+ "loss": 0.1584029197692871,
+ "mean_token_accuracy": 0.9460993978381157,
+ "num_tokens": 7206950.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.22037068914622068,
+ "epoch": 6.493827160493828,
+ "grad_norm": 0.6232843995094299,
+ "learning_rate": 9.145914303624717e-05,
+ "loss": 0.15544342041015624,
+ "mean_token_accuracy": 0.9450622496008872,
+ "num_tokens": 7359429.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.2320463878661394,
+ "epoch": 6.623781676413255,
+ "grad_norm": 0.7459681630134583,
+ "learning_rate": 8.560801372831975e-05,
+ "loss": 0.16350215911865235,
+ "mean_token_accuracy": 0.9416968420147895,
+ "num_tokens": 7499281.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.22943626195192338,
+ "epoch": 6.753736192332683,
+ "grad_norm": 0.7482302784919739,
+ "learning_rate": 7.986530212552506e-05,
+ "loss": 0.16422538757324218,
+ "mean_token_accuracy": 0.9434959614276885,
+ "num_tokens": 7640758.0,
+ "step": 2600
+ },
+ {
+ "entropy": 0.21795938543975354,
+ "epoch": 6.883690708252112,
+ "grad_norm": 0.5210486054420471,
+ "learning_rate": 7.424280806206118e-05,
+ "loss": 0.15540474891662598,
+ "mean_token_accuracy": 0.9459306105971337,
+ "num_tokens": 7791986.0,
+ "step": 2650
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.3157534837149657,
+ "eval_loss": 0.7744874954223633,
+ "eval_mean_token_accuracy": 0.8346395036922052,
+ "eval_num_tokens": 7924980.0,
+ "eval_runtime": 53.3771,
+ "eval_samples_per_second": 31.043,
+ "eval_steps_per_second": 3.897,
+ "step": 2695
+ },
+ {
+ "entropy": 0.2124774060656677,
+ "epoch": 7.012995451591943,
+ "grad_norm": 0.42238789796829224,
+ "learning_rate": 6.875208435518865e-05,
+ "loss": 0.14792531967163086,
+ "mean_token_accuracy": 0.9490461115861059,
+ "num_tokens": 7940521.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.15692965138703585,
+ "epoch": 7.142949967511371,
+ "grad_norm": 0.4711572229862213,
+ "learning_rate": 6.340441306708468e-05,
+ "loss": 0.09051708221435546,
+ "mean_token_accuracy": 0.9700166273117066,
+ "num_tokens": 8084193.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.15241683423519134,
+ "epoch": 7.272904483430799,
+ "grad_norm": 0.4312196671962738,
+ "learning_rate": 5.821078232303016e-05,
+ "loss": 0.08812363624572754,
+ "mean_token_accuracy": 0.9699361199140548,
+ "num_tokens": 8230159.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.1459079357981682,
+ "epoch": 7.402858999350228,
+ "grad_norm": 0.4804084002971649,
+ "learning_rate": 5.3181863733564636e-05,
+ "loss": 0.08675944328308105,
+ "mean_token_accuracy": 0.9703072866797448,
+ "num_tokens": 8380556.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.15621621005237102,
+ "epoch": 7.532813515269655,
+ "grad_norm": 0.5435478091239929,
+ "learning_rate": 4.83279904669986e-05,
+ "loss": 0.09016354560852051,
+ "mean_token_accuracy": 0.9674961140751839,
+ "num_tokens": 8523248.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.1545175113901496,
+ "epoch": 7.662768031189084,
+ "grad_norm": 0.524286687374115,
+ "learning_rate": 4.365913601734056e-05,
+ "loss": 0.09049141883850098,
+ "mean_token_accuracy": 0.9679373624920845,
+ "num_tokens": 8672002.0,
+ "step": 2950
+ },
+ {
+ "entropy": 0.1553485019877553,
+ "epoch": 7.792722547108512,
+ "grad_norm": 0.5006484389305115,
+ "learning_rate": 3.9184893711264495e-05,
+ "loss": 0.08913107872009278,
+ "mean_token_accuracy": 0.9684559822082519,
+ "num_tokens": 8816090.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.15444331549108029,
+ "epoch": 7.92267706302794,
+ "grad_norm": 0.5613893866539001,
+ "learning_rate": 3.491445699622611e-05,
+ "loss": 0.08711207389831543,
+ "mean_token_accuracy": 0.9684525722265244,
+ "num_tokens": 8966004.0,
+ "step": 3050
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.26580205430778175,
+ "eval_loss": 0.8996392488479614,
+ "eval_mean_token_accuracy": 0.8319417791297803,
+ "eval_num_tokens": 9057120.0,
+ "eval_runtime": 53.4042,
+ "eval_samples_per_second": 31.028,
+ "eval_steps_per_second": 3.895,
+ "step": 3080
+ },
+ {
+ "entropy": 0.14060429509860187,
+ "epoch": 8.05198180636777,
+ "grad_norm": 0.25378674268722534,
+ "learning_rate": 3.085660055023035e-05,
+ "loss": 0.07468742847442628,
+ "mean_token_accuracy": 0.9737296260181983,
+ "num_tokens": 9115929.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.12152639016509056,
+ "epoch": 8.1819363222872,
+ "grad_norm": 0.3634810447692871,
+ "learning_rate": 2.7019662252065798e-05,
+ "loss": 0.05918361663818359,
+ "mean_token_accuracy": 0.9785374769568443,
+ "num_tokens": 9260646.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.1228457903675735,
+ "epoch": 8.311890838206628,
+ "grad_norm": 0.32873299717903137,
+ "learning_rate": 2.3411526049051643e-05,
+ "loss": 0.060924801826477054,
+ "mean_token_accuracy": 0.9778030979633331,
+ "num_tokens": 9404737.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.11790491977706552,
+ "epoch": 8.441845354126055,
+ "grad_norm": 0.27871423959732056,
+ "learning_rate": 2.0039605757500512e-05,
+ "loss": 0.05871880531311035,
+ "mean_token_accuracy": 0.9786837643384934,
+ "num_tokens": 9552241.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.12911307733505964,
+ "epoch": 8.571799870045485,
+ "grad_norm": 0.4569152593612671,
+ "learning_rate": 1.691082982918235e-05,
+ "loss": 0.06450970649719238,
+ "mean_token_accuracy": 0.9761316785216332,
+ "num_tokens": 9689407.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.11636774389073253,
+ "epoch": 8.701754385964913,
+ "grad_norm": 0.27477338910102844,
+ "learning_rate": 1.403162711509129e-05,
+ "loss": 0.05784036159515381,
+ "mean_token_accuracy": 0.9791285961866378,
+ "num_tokens": 9842204.0,
+ "step": 3350
+ },
+ {
+ "entropy": 0.11710284009575844,
+ "epoch": 8.83170890188434,
+ "grad_norm": 0.26900890469551086,
+ "learning_rate": 1.1407913655766755e-05,
+ "loss": 0.05737146377563476,
+ "mean_token_accuracy": 0.9788037702441216,
+ "num_tokens": 9994524.0,
+ "step": 3400
+ },
+ {
+ "entropy": 0.11474769543856382,
+ "epoch": 8.961663417803768,
+ "grad_norm": 0.3137633204460144,
+ "learning_rate": 9.045080525311815e-06,
+ "loss": 0.057830405235290525,
+ "mean_token_accuracy": 0.9789029136300087,
+ "num_tokens": 10147597.0,
+ "step": 3450
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.23872991701444754,
+ "eval_loss": 1.007972002029419,
+ "eval_mean_token_accuracy": 0.8318104110658169,
+ "eval_num_tokens": 10189260.0,
+ "eval_runtime": 53.4047,
+ "eval_samples_per_second": 31.027,
+ "eval_steps_per_second": 3.895,
+ "step": 3465
+ },
+ {
+ "entropy": 0.10747776249769944,
+ "epoch": 9.0909681611436,
+ "grad_norm": 0.21027140319347382,
+ "learning_rate": 6.9479827540858e-06,
+ "loss": 0.05341584682464599,
+ "mean_token_accuracy": 0.9819158309668152,
+ "num_tokens": 10294278.0,
+ "step": 3500
+ },
+ {
+ "entropy": 0.10646019088104368,
+ "epoch": 9.220922677063028,
+ "grad_norm": 0.1820213794708252,
+ "learning_rate": 5.120929352832946e-06,
+ "loss": 0.04993132591247559,
+ "mean_token_accuracy": 0.9813734939694405,
+ "num_tokens": 10446433.0,
+ "step": 3550
+ },
+ {
+ "entropy": 0.11276215925812721,
+ "epoch": 9.350877192982455,
+ "grad_norm": 0.28350868821144104,
+ "learning_rate": 3.5676744587442527e-06,
+ "loss": 0.05132147789001465,
+ "mean_token_accuracy": 0.9798016020655632,
+ "num_tokens": 10596480.0,
+ "step": 3600
+ },
+ {
+ "entropy": 0.11684846783056856,
+ "epoch": 9.480831708901885,
+ "grad_norm": 0.24154022336006165,
+ "learning_rate": 2.2914096216458985e-06,
+ "loss": 0.05330245018005371,
+ "mean_token_accuracy": 0.9795207896828652,
+ "num_tokens": 10740528.0,
+ "step": 3650
+ },
+ {
+ "entropy": 0.11312318585813046,
+ "epoch": 9.610786224821313,
+ "grad_norm": 0.28576743602752686,
+ "learning_rate": 1.2947572461634096e-06,
+ "loss": 0.05499160289764404,
+ "mean_token_accuracy": 0.9797447052598,
+ "num_tokens": 10881750.0,
+ "step": 3700
+ },
+ {
+ "entropy": 0.11110325066372752,
+ "epoch": 9.74074074074074,
+ "grad_norm": 0.22744601964950562,
+ "learning_rate": 5.79765203336998e-07,
+ "loss": 0.05335733413696289,
+ "mean_token_accuracy": 0.9804390069842338,
+ "num_tokens": 11027552.0,
+ "step": 3750
+ },
+ {
+ "entropy": 0.11025484301149845,
+ "epoch": 9.870695256660168,
+ "grad_norm": 0.2376026064157486,
+ "learning_rate": 1.4790262275940392e-07,
+ "loss": 0.053631534576416014,
+ "mean_token_accuracy": 0.9807139033079147,
+ "num_tokens": 11172587.0,
+ "step": 3800
+ },
+ {
+ "entropy": 0.11222033412312743,
+ "epoch": 10.0,
+ "grad_norm": 0.22163568437099457,
+ "learning_rate": 5.6873882486966634e-11,
+ "loss": 0.05134555339813232,
+ "mean_token_accuracy": 0.9805273500519183,
+ "num_tokens": 11321400.0,
+ "step": 3850
+ },
+ {
+ "epoch": 10.0,
+ "eval_entropy": 0.23073764632527644,
+ "eval_loss": 1.059328556060791,
+ "eval_mean_token_accuracy": 0.8315239631785796,
+ "eval_num_tokens": 11321400.0,
+ "eval_runtime": 53.4589,
+ "eval_samples_per_second": 30.996,
+ "eval_steps_per_second": 3.891,
+ "step": 3850
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.925137653703373e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7ee79256b30e387ac41d2786d79a749c70114aaa
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/README.md
@@ -0,0 +1,58 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: transformers
+model_name: Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1
+tags:
+- generated_from_trainer
+- sft
+- trl
+licence: license
+---
+
+# Model Card for Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1
+
+This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="path/to/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
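+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/ranbbj8x)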
+
+
+
+This model was trained with SFT.
+
+### Framework versions
+
+- TRL: 0.29.0
+- Transformers: 5.5.4
+- Pytorch: 2.10.0
+- Datasets: 4.6.1
+- Tokenizers: 0.22.2
+
+## Citations
+
+
+
+Cite TRL as:
+
+```bibtex
+@software{vonwerra2020trl,
+ title = {{TRL: Transformers Reinforcement Learning}},
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+ license = {Apache-2.0},
+ url = {https://github.com/huggingface/trl},
+ year = {2020}
+}
+```
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test1/checkpoint-1122/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6f9710c7923482a2e6cb286018e55b18316a2856
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/README.md
@@ -0,0 +1,58 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: transformers
+model_name: Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2
+tags:
+- generated_from_trainer
+- sft
+- trl
+licence: license
+---
+
+# Model Card for Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2
+
+This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/y6cm94yy)
+
+
+
+This model was trained with SFT.
+
+### Framework versions
+
+- TRL: 0.29.0
+- Transformers: 5.5.4
+- Pytorch: 2.10.0
+- Datasets: 4.6.1
+- Tokenizers: 0.22.2
+
+## Citations
+
+
+
+Cite TRL as:
+
+```bibtex
+@software{vonwerra2020trl,
+ title = {{TRL: Transformers Reinforcement Learning}},
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+ license = {Apache-2.0},
+ url = {https://github.com/huggingface/trl},
+ year = {2020}
+}
+```
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.0094403300459725,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "q_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{# system turn: optional system message followed by the JSON signature of every tool #}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{# walk backwards to find the last user message that is not a wrapped tool response #}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}{# reasoning is inlined in the content: split it from the visible answer #}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{# only assistant turns after the last real user query keep their reasoning #}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}{# consecutive tool messages are merged into a single user turn #}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}{# thinking disabled: pre-fill an empty think block #}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7efdc248db95c46bb6a9070adb343c159b1896cc
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1122/trainer_state.json
@@ -0,0 +1,287 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 1122,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2357245934009553,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1940524578094482,
+ "learning_rate": 3.0637587087867373e-05,
+ "loss": 2.0509376525878906,
+ "mean_token_accuracy": 0.6194104523956776,
+ "num_tokens": 127704.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8588631230592728,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6621095538139343,
+ "learning_rate": 6.190043105507899e-05,
+ "loss": 0.8043325805664062,
+ "mean_token_accuracy": 0.7855384379625321,
+ "num_tokens": 256077.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6726470375061036,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5255736112594604,
+ "learning_rate": 9.316327502229059e-05,
+ "loss": 0.6415129089355469,
+ "mean_token_accuracy": 0.8183896672725678,
+ "num_tokens": 387735.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6119109469652176,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45975860953330994,
+ "learning_rate": 0.0001244261189895022,
+ "loss": 0.576114501953125,
+ "mean_token_accuracy": 0.8376823288202285,
+ "num_tokens": 522202.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5972725109755993,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.40055611729621887,
+ "learning_rate": 0.0001556889629567138,
+ "loss": 0.5613529205322265,
+ "mean_token_accuracy": 0.8418879929184914,
+ "num_tokens": 648663.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5673307004570961,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.7427454590797424,
+ "learning_rate": 0.0001869518069239254,
+ "loss": 0.5325925827026368,
+ "mean_token_accuracy": 0.8488189685344696,
+ "num_tokens": 778245.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5615521620213986,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.4017025828361511,
+ "learning_rate": 0.000218214650891137,
+ "loss": 0.522266616821289,
+ "mean_token_accuracy": 0.8500416606664658,
+ "num_tokens": 905328.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5758187392354012,
+ "eval_loss": 0.5664522051811218,
+ "eval_mean_token_accuracy": 0.8383084440231323,
+ "eval_num_tokens": 959743.0,
+ "eval_runtime": 51.7964,
+ "eval_samples_per_second": 30.852,
+ "eval_steps_per_second": 3.861,
+ "step": 374
+ },
+ {
+ "entropy": 0.5440343178883947,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.3084093928337097,
+ "learning_rate": 0.00023381424541885068,
+ "loss": 0.5051319122314453,
+ "mean_token_accuracy": 0.8554374280602041,
+ "num_tokens": 1022581.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5245272906124592,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3342743515968323,
+ "learning_rate": 0.00023355972972676628,
+ "loss": 0.4855318450927734,
+ "mean_token_accuracy": 0.8584430786967278,
+ "num_tokens": 1151430.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5234513898193837,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.26953577995300293,
+ "learning_rate": 0.00023305125251804043,
+ "loss": 0.4811768341064453,
+ "mean_token_accuracy": 0.8606387570500373,
+ "num_tokens": 1282481.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5147616830468178,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.28791534900665283,
+ "learning_rate": 0.0002322899209369128,
+ "loss": 0.4762062835693359,
+ "mean_token_accuracy": 0.8612849581241607,
+ "num_tokens": 1414166.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5060296922922134,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.2496163249015808,
+ "learning_rate": 0.0002312773926857543,
+ "loss": 0.4695176315307617,
+ "mean_token_accuracy": 0.8636157616972924,
+ "num_tokens": 1547902.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5008939932286739,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2329542636871338,
+ "learning_rate": 0.00023001587241563198,
+ "loss": 0.46317913055419924,
+ "mean_token_accuracy": 0.8652430367469788,
+ "num_tokens": 1678635.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.4986082436144352,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2459402084350586,
+ "learning_rate": 0.00022850810692596235,
+ "loss": 0.4617066192626953,
+ "mean_token_accuracy": 0.8672981086373329,
+ "num_tokens": 1803328.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.546445109397173,
+ "eval_loss": 0.5307288765907288,
+ "eval_mean_token_accuracy": 0.8434162598848343,
+ "eval_num_tokens": 1919486.0,
+ "eval_runtime": 51.3627,
+ "eval_samples_per_second": 31.112,
+ "eval_steps_per_second": 3.894,
+ "step": 748
+ },
+ {
+ "entropy": 0.5069879525237613,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.21533548831939697,
+ "learning_rate": 0.00022675737918370628,
+ "loss": 0.4585062026977539,
+ "mean_token_accuracy": 0.865652882390552,
+ "num_tokens": 1925307.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4457989126443863,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.24705350399017334,
+ "learning_rate": 0.00022476750117512737,
+ "loss": 0.4026710891723633,
+ "mean_token_accuracy": 0.8774338760972022,
+ "num_tokens": 2055627.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.45084999009966853,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.26643863320350647,
+ "learning_rate": 0.00022254280560567822,
+ "loss": 0.40950340270996094,
+ "mean_token_accuracy": 0.8752825647592545,
+ "num_tokens": 2177850.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4516983331739903,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.26972198486328125,
+ "learning_rate": 0.00022008813646608725,
+ "loss": 0.4115512466430664,
+ "mean_token_accuracy": 0.8761435833573341,
+ "num_tokens": 2304612.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.45280722543597224,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.2643600106239319,
+ "learning_rate": 0.00021740883848518684,
+ "loss": 0.41181053161621095,
+ "mean_token_accuracy": 0.8756420350074768,
+ "num_tokens": 2430946.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4487862553447485,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.2849285304546356,
+ "learning_rate": 0.00021451074549244846,
+ "loss": 0.4094270706176758,
+ "mean_token_accuracy": 0.8771369129419326,
+ "num_tokens": 2557241.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.45000545382499696,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.24081671237945557,
+ "learning_rate": 0.0002114001677155633,
+ "loss": 0.4073855972290039,
+ "mean_token_accuracy": 0.8779223081469536,
+ "num_tokens": 2692775.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4576124830543995,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.2341010719537735,
+ "learning_rate": 0.00020808387804072673,
+ "loss": 0.4154107666015625,
+ "mean_token_accuracy": 0.8756425747275353,
+ "num_tokens": 2823060.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4812743577361107,
+ "eval_loss": 0.523208498954773,
+ "eval_mean_token_accuracy": 0.8494922530651092,
+ "eval_num_tokens": 2879229.0,
+ "eval_runtime": 51.3707,
+ "eval_samples_per_second": 31.107,
+ "eval_steps_per_second": 3.893,
+ "step": 1122
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.23531258243029e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.0094403300459725,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "q_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/chat_template.jinja
@@ -0,0 +1,90 @@
+{#- Qwen3 ChatML chat template: renders `messages` (plus optional `tools`) as <|im_start|>role ... <|im_end|> turns. #}
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Walk the messages backwards to find the last user message that is not a wrapped tool response; only assistant turns after that index may render a think block. #}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {#- No explicit reasoning_content: split an inline <think>...</think> section out of the message content. #}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {#- Consecutive tool messages share a single user turn; each one is wrapped in <tool_response> tags. #}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {#- When enable_thinking is explicitly false, pre-fill an empty think block ahead of the generated reply. #}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8579a5b3822c16431e12b7178ef55a17371e2072
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1496/trainer_state.json
@@ -0,0 +1,368 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 4.0,
+ "eval_steps": 500,
+ "global_step": 1496,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2357245934009553,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1940524578094482,
+ "learning_rate": 3.0637587087867373e-05,
+ "loss": 2.0509376525878906,
+ "mean_token_accuracy": 0.6194104523956776,
+ "num_tokens": 127704.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8588631230592728,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6621095538139343,
+ "learning_rate": 6.190043105507899e-05,
+ "loss": 0.8043325805664062,
+ "mean_token_accuracy": 0.7855384379625321,
+ "num_tokens": 256077.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6726470375061036,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5255736112594604,
+ "learning_rate": 9.316327502229059e-05,
+ "loss": 0.6415129089355469,
+ "mean_token_accuracy": 0.8183896672725678,
+ "num_tokens": 387735.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6119109469652176,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45975860953330994,
+ "learning_rate": 0.0001244261189895022,
+ "loss": 0.576114501953125,
+ "mean_token_accuracy": 0.8376823288202285,
+ "num_tokens": 522202.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5972725109755993,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.40055611729621887,
+ "learning_rate": 0.0001556889629567138,
+ "loss": 0.5613529205322265,
+ "mean_token_accuracy": 0.8418879929184914,
+ "num_tokens": 648663.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5673307004570961,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.7427454590797424,
+ "learning_rate": 0.0001869518069239254,
+ "loss": 0.5325925827026368,
+ "mean_token_accuracy": 0.8488189685344696,
+ "num_tokens": 778245.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5615521620213986,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.4017025828361511,
+ "learning_rate": 0.000218214650891137,
+ "loss": 0.522266616821289,
+ "mean_token_accuracy": 0.8500416606664658,
+ "num_tokens": 905328.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5758187392354012,
+ "eval_loss": 0.5664522051811218,
+ "eval_mean_token_accuracy": 0.8383084440231323,
+ "eval_num_tokens": 959743.0,
+ "eval_runtime": 51.7964,
+ "eval_samples_per_second": 30.852,
+ "eval_steps_per_second": 3.861,
+ "step": 374
+ },
+ {
+ "entropy": 0.5440343178883947,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.3084093928337097,
+ "learning_rate": 0.00023381424541885068,
+ "loss": 0.5051319122314453,
+ "mean_token_accuracy": 0.8554374280602041,
+ "num_tokens": 1022581.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5245272906124592,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3342743515968323,
+ "learning_rate": 0.00023355972972676628,
+ "loss": 0.4855318450927734,
+ "mean_token_accuracy": 0.8584430786967278,
+ "num_tokens": 1151430.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5234513898193837,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.26953577995300293,
+ "learning_rate": 0.00023305125251804043,
+ "loss": 0.4811768341064453,
+ "mean_token_accuracy": 0.8606387570500373,
+ "num_tokens": 1282481.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5147616830468178,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.28791534900665283,
+ "learning_rate": 0.0002322899209369128,
+ "loss": 0.4762062835693359,
+ "mean_token_accuracy": 0.8612849581241607,
+ "num_tokens": 1414166.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5060296922922134,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.2496163249015808,
+ "learning_rate": 0.0002312773926857543,
+ "loss": 0.4695176315307617,
+ "mean_token_accuracy": 0.8636157616972924,
+ "num_tokens": 1547902.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5008939932286739,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2329542636871338,
+ "learning_rate": 0.00023001587241563198,
+ "loss": 0.46317913055419924,
+ "mean_token_accuracy": 0.8652430367469788,
+ "num_tokens": 1678635.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.4986082436144352,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2459402084350586,
+ "learning_rate": 0.00022850810692596235,
+ "loss": 0.4617066192626953,
+ "mean_token_accuracy": 0.8672981086373329,
+ "num_tokens": 1803328.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.546445109397173,
+ "eval_loss": 0.5307288765907288,
+ "eval_mean_token_accuracy": 0.8434162598848343,
+ "eval_num_tokens": 1919486.0,
+ "eval_runtime": 51.3627,
+ "eval_samples_per_second": 31.112,
+ "eval_steps_per_second": 3.894,
+ "step": 748
+ },
+ {
+ "entropy": 0.5069879525237613,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.21533548831939697,
+ "learning_rate": 0.00022675737918370628,
+ "loss": 0.4585062026977539,
+ "mean_token_accuracy": 0.865652882390552,
+ "num_tokens": 1925307.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4457989126443863,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.24705350399017334,
+ "learning_rate": 0.00022476750117512737,
+ "loss": 0.4026710891723633,
+ "mean_token_accuracy": 0.8774338760972022,
+ "num_tokens": 2055627.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.45084999009966853,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.26643863320350647,
+ "learning_rate": 0.00022254280560567822,
+ "loss": 0.40950340270996094,
+ "mean_token_accuracy": 0.8752825647592545,
+ "num_tokens": 2177850.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4516983331739903,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.26972198486328125,
+ "learning_rate": 0.00022008813646608725,
+ "loss": 0.4115512466430664,
+ "mean_token_accuracy": 0.8761435833573341,
+ "num_tokens": 2304612.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.45280722543597224,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.2643600106239319,
+ "learning_rate": 0.00021740883848518684,
+ "loss": 0.41181053161621095,
+ "mean_token_accuracy": 0.8756420350074768,
+ "num_tokens": 2430946.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4487862553447485,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.2849285304546356,
+ "learning_rate": 0.00021451074549244846,
+ "loss": 0.4094270706176758,
+ "mean_token_accuracy": 0.8771369129419326,
+ "num_tokens": 2557241.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.45000545382499696,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.24081671237945557,
+ "learning_rate": 0.0002114001677155633,
+ "loss": 0.4073855972290039,
+ "mean_token_accuracy": 0.8779223081469536,
+ "num_tokens": 2692775.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4576124830543995,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.2341010719537735,
+ "learning_rate": 0.00020808387804072673,
+ "loss": 0.4154107666015625,
+ "mean_token_accuracy": 0.8756425747275353,
+ "num_tokens": 2823060.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4812743577361107,
+ "eval_loss": 0.523208498954773,
+ "eval_mean_token_accuracy": 0.8494922530651092,
+ "eval_num_tokens": 2879229.0,
+ "eval_runtime": 51.3707,
+ "eval_samples_per_second": 31.107,
+ "eval_steps_per_second": 3.893,
+ "step": 1122
+ },
+ {
+ "entropy": 0.41342759042075183,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.34829744696617126,
+ "learning_rate": 0.0002045690972655427,
+ "loss": 0.3644887542724609,
+ "mean_token_accuracy": 0.8879408977850519,
+ "num_tokens": 2955424.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.3948360003530979,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.312427282333374,
+ "learning_rate": 0.00020086347837665854,
+ "loss": 0.34146129608154296,
+ "mean_token_accuracy": 0.8914449456334114,
+ "num_tokens": 3078799.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.38602679744362833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.2715625762939453,
+ "learning_rate": 0.0001969750898863629,
+ "loss": 0.34105979919433593,
+ "mean_token_accuracy": 0.8928995525836945,
+ "num_tokens": 3211945.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.3923524462431669,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.27295640110969543,
+ "learning_rate": 0.00019291239826442992,
+ "loss": 0.3473458099365234,
+ "mean_token_accuracy": 0.8913829082250595,
+ "num_tokens": 3343933.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.40172914519906044,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.32684192061424255,
+ "learning_rate": 0.0001886842495034615,
+ "loss": 0.35543827056884764,
+ "mean_token_accuracy": 0.8904094022512435,
+ "num_tokens": 3470087.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4010512103140354,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.23922984302043915,
+ "learning_rate": 0.00018429984985786734,
+ "loss": 0.3535212326049805,
+ "mean_token_accuracy": 0.8891122484207153,
+ "num_tokens": 3590858.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.39226082623004915,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.25130993127822876,
+ "learning_rate": 0.00017976874579842046,
+ "loss": 0.3484851837158203,
+ "mean_token_accuracy": 0.8916165816783905,
+ "num_tokens": 3725806.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.45332007586956025,
+ "eval_loss": 0.5243425965309143,
+ "eval_mean_token_accuracy": 0.8532214590907097,
+ "eval_num_tokens": 3838972.0,
+ "eval_runtime": 51.3837,
+ "eval_samples_per_second": 31.099,
+ "eval_steps_per_second": 3.892,
+ "step": 1496
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.647043150972508e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.0094403300459725,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "q_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/chat_template.jinja
@@ -0,0 +1,90 @@
+{#- Qwen3 ChatML chat template: renders `messages` (plus optional `tools`) as <|im_start|>role ... <|im_end|> turns. #}
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Walk the messages backwards to find the last user message that is not a wrapped tool response; only assistant turns after that index may render a think block. #}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {#- No explicit reasoning_content: split an inline <think>...</think> section out of the message content. #}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {#- Consecutive tool messages share a single user turn; each one is wrapped in <tool_response> tags. #}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {#- When enable_thinking is explicitly false, pre-fill an empty think block ahead of the generated reply. #}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1325a869fb61802392682d92bac5b6563c8c2143
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-1870/trainer_state.json
@@ -0,0 +1,459 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 5.0,
+ "eval_steps": 500,
+ "global_step": 1870,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2357245934009553,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1940524578094482,
+ "learning_rate": 3.0637587087867373e-05,
+ "loss": 2.0509376525878906,
+ "mean_token_accuracy": 0.6194104523956776,
+ "num_tokens": 127704.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8588631230592728,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6621095538139343,
+ "learning_rate": 6.190043105507899e-05,
+ "loss": 0.8043325805664062,
+ "mean_token_accuracy": 0.7855384379625321,
+ "num_tokens": 256077.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6726470375061036,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5255736112594604,
+ "learning_rate": 9.316327502229059e-05,
+ "loss": 0.6415129089355469,
+ "mean_token_accuracy": 0.8183896672725678,
+ "num_tokens": 387735.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6119109469652176,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45975860953330994,
+ "learning_rate": 0.0001244261189895022,
+ "loss": 0.576114501953125,
+ "mean_token_accuracy": 0.8376823288202285,
+ "num_tokens": 522202.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5972725109755993,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.40055611729621887,
+ "learning_rate": 0.0001556889629567138,
+ "loss": 0.5613529205322265,
+ "mean_token_accuracy": 0.8418879929184914,
+ "num_tokens": 648663.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5673307004570961,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.7427454590797424,
+ "learning_rate": 0.0001869518069239254,
+ "loss": 0.5325925827026368,
+ "mean_token_accuracy": 0.8488189685344696,
+ "num_tokens": 778245.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5615521620213986,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.4017025828361511,
+ "learning_rate": 0.000218214650891137,
+ "loss": 0.522266616821289,
+ "mean_token_accuracy": 0.8500416606664658,
+ "num_tokens": 905328.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5758187392354012,
+ "eval_loss": 0.5664522051811218,
+ "eval_mean_token_accuracy": 0.8383084440231323,
+ "eval_num_tokens": 959743.0,
+ "eval_runtime": 51.7964,
+ "eval_samples_per_second": 30.852,
+ "eval_steps_per_second": 3.861,
+ "step": 374
+ },
+ {
+ "entropy": 0.5440343178883947,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.3084093928337097,
+ "learning_rate": 0.00023381424541885068,
+ "loss": 0.5051319122314453,
+ "mean_token_accuracy": 0.8554374280602041,
+ "num_tokens": 1022581.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5245272906124592,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3342743515968323,
+ "learning_rate": 0.00023355972972676628,
+ "loss": 0.4855318450927734,
+ "mean_token_accuracy": 0.8584430786967278,
+ "num_tokens": 1151430.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5234513898193837,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.26953577995300293,
+ "learning_rate": 0.00023305125251804043,
+ "loss": 0.4811768341064453,
+ "mean_token_accuracy": 0.8606387570500373,
+ "num_tokens": 1282481.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5147616830468178,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.28791534900665283,
+ "learning_rate": 0.0002322899209369128,
+ "loss": 0.4762062835693359,
+ "mean_token_accuracy": 0.8612849581241607,
+ "num_tokens": 1414166.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5060296922922134,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.2496163249015808,
+ "learning_rate": 0.0002312773926857543,
+ "loss": 0.4695176315307617,
+ "mean_token_accuracy": 0.8636157616972924,
+ "num_tokens": 1547902.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5008939932286739,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2329542636871338,
+ "learning_rate": 0.00023001587241563198,
+ "loss": 0.46317913055419924,
+ "mean_token_accuracy": 0.8652430367469788,
+ "num_tokens": 1678635.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.4986082436144352,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2459402084350586,
+ "learning_rate": 0.00022850810692596235,
+ "loss": 0.4617066192626953,
+ "mean_token_accuracy": 0.8672981086373329,
+ "num_tokens": 1803328.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.546445109397173,
+ "eval_loss": 0.5307288765907288,
+ "eval_mean_token_accuracy": 0.8434162598848343,
+ "eval_num_tokens": 1919486.0,
+ "eval_runtime": 51.3627,
+ "eval_samples_per_second": 31.112,
+ "eval_steps_per_second": 3.894,
+ "step": 748
+ },
+ {
+ "entropy": 0.5069879525237613,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.21533548831939697,
+ "learning_rate": 0.00022675737918370628,
+ "loss": 0.4585062026977539,
+ "mean_token_accuracy": 0.865652882390552,
+ "num_tokens": 1925307.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4457989126443863,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.24705350399017334,
+ "learning_rate": 0.00022476750117512737,
+ "loss": 0.4026710891723633,
+ "mean_token_accuracy": 0.8774338760972022,
+ "num_tokens": 2055627.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.45084999009966853,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.26643863320350647,
+ "learning_rate": 0.00022254280560567822,
+ "loss": 0.40950340270996094,
+ "mean_token_accuracy": 0.8752825647592545,
+ "num_tokens": 2177850.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4516983331739903,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.26972198486328125,
+ "learning_rate": 0.00022008813646608725,
+ "loss": 0.4115512466430664,
+ "mean_token_accuracy": 0.8761435833573341,
+ "num_tokens": 2304612.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.45280722543597224,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.2643600106239319,
+ "learning_rate": 0.00021740883848518684,
+ "loss": 0.41181053161621095,
+ "mean_token_accuracy": 0.8756420350074768,
+ "num_tokens": 2430946.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4487862553447485,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.2849285304546356,
+ "learning_rate": 0.00021451074549244846,
+ "loss": 0.4094270706176758,
+ "mean_token_accuracy": 0.8771369129419326,
+ "num_tokens": 2557241.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.45000545382499696,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.24081671237945557,
+ "learning_rate": 0.0002114001677155633,
+ "loss": 0.4073855972290039,
+ "mean_token_accuracy": 0.8779223081469536,
+ "num_tokens": 2692775.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4576124830543995,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.2341010719537735,
+ "learning_rate": 0.00020808387804072673,
+ "loss": 0.4154107666015625,
+ "mean_token_accuracy": 0.8756425747275353,
+ "num_tokens": 2823060.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4812743577361107,
+ "eval_loss": 0.523208498954773,
+ "eval_mean_token_accuracy": 0.8494922530651092,
+ "eval_num_tokens": 2879229.0,
+ "eval_runtime": 51.3707,
+ "eval_samples_per_second": 31.107,
+ "eval_steps_per_second": 3.893,
+ "step": 1122
+ },
+ {
+ "entropy": 0.41342759042075183,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.34829744696617126,
+ "learning_rate": 0.0002045690972655427,
+ "loss": 0.3644887542724609,
+ "mean_token_accuracy": 0.8879408977850519,
+ "num_tokens": 2955424.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.3948360003530979,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.312427282333374,
+ "learning_rate": 0.00020086347837665854,
+ "loss": 0.34146129608154296,
+ "mean_token_accuracy": 0.8914449456334114,
+ "num_tokens": 3078799.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.38602679744362833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.2715625762939453,
+ "learning_rate": 0.0001969750898863629,
+ "loss": 0.34105979919433593,
+ "mean_token_accuracy": 0.8928995525836945,
+ "num_tokens": 3211945.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.3923524462431669,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.27295640110969543,
+ "learning_rate": 0.00019291239826442992,
+ "loss": 0.3473458099365234,
+ "mean_token_accuracy": 0.8913829082250595,
+ "num_tokens": 3343933.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.40172914519906044,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.32684192061424255,
+ "learning_rate": 0.0001886842495034615,
+ "loss": 0.35543827056884764,
+ "mean_token_accuracy": 0.8904094022512435,
+ "num_tokens": 3470087.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4010512103140354,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.23922984302043915,
+ "learning_rate": 0.00018429984985786734,
+ "loss": 0.3535212326049805,
+ "mean_token_accuracy": 0.8891122484207153,
+ "num_tokens": 3590858.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.39226082623004915,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.25130993127822876,
+ "learning_rate": 0.00017976874579842046,
+ "loss": 0.3484851837158203,
+ "mean_token_accuracy": 0.8916165816783905,
+ "num_tokens": 3725806.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.45332007586956025,
+ "eval_loss": 0.5243425965309143,
+ "eval_mean_token_accuracy": 0.8532214590907097,
+ "eval_num_tokens": 3838972.0,
+ "eval_runtime": 51.3837,
+ "eval_samples_per_second": 31.099,
+ "eval_steps_per_second": 3.892,
+ "step": 1496
+ },
+ {
+ "entropy": 0.39358587043754983,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.27960067987442017,
+ "learning_rate": 0.0001751008032260355,
+ "loss": 0.34616813659667967,
+ "mean_token_accuracy": 0.8923709010234987,
+ "num_tokens": 3849380.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3206369188427925,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.37261858582496643,
+ "learning_rate": 0.00017030618599002818,
+ "loss": 0.2684581565856934,
+ "mean_token_accuracy": 0.9131761506199837,
+ "num_tokens": 3976694.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3254615054279566,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.39803799986839294,
+ "learning_rate": 0.00016539533375763032,
+ "loss": 0.2769618606567383,
+ "mean_token_accuracy": 0.9102409112453461,
+ "num_tokens": 4110204.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.32003962114453316,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.35707366466522217,
+ "learning_rate": 0.0001603789392829468,
+ "loss": 0.2749842834472656,
+ "mean_token_accuracy": 0.910626070201397,
+ "num_tokens": 4240883.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.32672152675688265,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.47052621841430664,
+ "learning_rate": 0.00015526792512484774,
+ "loss": 0.27983531951904295,
+ "mean_token_accuracy": 0.9093958771228791,
+ "num_tokens": 4365381.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.33449163861572745,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.330709844827652,
+ "learning_rate": 0.00015007341986449012,
+ "loss": 0.28533639907836916,
+ "mean_token_accuracy": 0.9082232251763344,
+ "num_tokens": 4490711.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.33353066638112067,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.3990134298801422,
+ "learning_rate": 0.00014480673387425272,
+ "loss": 0.28489078521728517,
+ "mean_token_accuracy": 0.908001911342144,
+ "num_tokens": 4618532.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3272412090748549,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3183020353317261,
+ "learning_rate": 0.00013947933469084315,
+ "loss": 0.2772365379333496,
+ "mean_token_accuracy": 0.908946952521801,
+ "num_tokens": 4752261.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3932068006694317,
+ "eval_loss": 0.5647156834602356,
+ "eval_mean_token_accuracy": 0.85078628718853,
+ "eval_num_tokens": 4798715.0,
+ "eval_runtime": 51.3578,
+ "eval_samples_per_second": 31.115,
+ "eval_steps_per_second": 3.894,
+ "step": 1870
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.0568507616161485e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.0094403300459725,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "q_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..838942db2ba3089518ff6c523ad22f7c0150b924
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2244/trainer_state.json
@@ -0,0 +1,540 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 6.0,
+ "eval_steps": 500,
+ "global_step": 2244,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2357245934009553,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1940524578094482,
+ "learning_rate": 3.0637587087867373e-05,
+ "loss": 2.0509376525878906,
+ "mean_token_accuracy": 0.6194104523956776,
+ "num_tokens": 127704.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8588631230592728,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6621095538139343,
+ "learning_rate": 6.190043105507899e-05,
+ "loss": 0.8043325805664062,
+ "mean_token_accuracy": 0.7855384379625321,
+ "num_tokens": 256077.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6726470375061036,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5255736112594604,
+ "learning_rate": 9.316327502229059e-05,
+ "loss": 0.6415129089355469,
+ "mean_token_accuracy": 0.8183896672725678,
+ "num_tokens": 387735.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6119109469652176,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45975860953330994,
+ "learning_rate": 0.0001244261189895022,
+ "loss": 0.576114501953125,
+ "mean_token_accuracy": 0.8376823288202285,
+ "num_tokens": 522202.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5972725109755993,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.40055611729621887,
+ "learning_rate": 0.0001556889629567138,
+ "loss": 0.5613529205322265,
+ "mean_token_accuracy": 0.8418879929184914,
+ "num_tokens": 648663.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5673307004570961,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.7427454590797424,
+ "learning_rate": 0.0001869518069239254,
+ "loss": 0.5325925827026368,
+ "mean_token_accuracy": 0.8488189685344696,
+ "num_tokens": 778245.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5615521620213986,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.4017025828361511,
+ "learning_rate": 0.000218214650891137,
+ "loss": 0.522266616821289,
+ "mean_token_accuracy": 0.8500416606664658,
+ "num_tokens": 905328.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5758187392354012,
+ "eval_loss": 0.5664522051811218,
+ "eval_mean_token_accuracy": 0.8383084440231323,
+ "eval_num_tokens": 959743.0,
+ "eval_runtime": 51.7964,
+ "eval_samples_per_second": 30.852,
+ "eval_steps_per_second": 3.861,
+ "step": 374
+ },
+ {
+ "entropy": 0.5440343178883947,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.3084093928337097,
+ "learning_rate": 0.00023381424541885068,
+ "loss": 0.5051319122314453,
+ "mean_token_accuracy": 0.8554374280602041,
+ "num_tokens": 1022581.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5245272906124592,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3342743515968323,
+ "learning_rate": 0.00023355972972676628,
+ "loss": 0.4855318450927734,
+ "mean_token_accuracy": 0.8584430786967278,
+ "num_tokens": 1151430.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5234513898193837,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.26953577995300293,
+ "learning_rate": 0.00023305125251804043,
+ "loss": 0.4811768341064453,
+ "mean_token_accuracy": 0.8606387570500373,
+ "num_tokens": 1282481.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5147616830468178,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.28791534900665283,
+ "learning_rate": 0.0002322899209369128,
+ "loss": 0.4762062835693359,
+ "mean_token_accuracy": 0.8612849581241607,
+ "num_tokens": 1414166.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5060296922922134,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.2496163249015808,
+ "learning_rate": 0.0002312773926857543,
+ "loss": 0.4695176315307617,
+ "mean_token_accuracy": 0.8636157616972924,
+ "num_tokens": 1547902.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5008939932286739,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2329542636871338,
+ "learning_rate": 0.00023001587241563198,
+ "loss": 0.46317913055419924,
+ "mean_token_accuracy": 0.8652430367469788,
+ "num_tokens": 1678635.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.4986082436144352,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2459402084350586,
+ "learning_rate": 0.00022850810692596235,
+ "loss": 0.4617066192626953,
+ "mean_token_accuracy": 0.8672981086373329,
+ "num_tokens": 1803328.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.546445109397173,
+ "eval_loss": 0.5307288765907288,
+ "eval_mean_token_accuracy": 0.8434162598848343,
+ "eval_num_tokens": 1919486.0,
+ "eval_runtime": 51.3627,
+ "eval_samples_per_second": 31.112,
+ "eval_steps_per_second": 3.894,
+ "step": 748
+ },
+ {
+ "entropy": 0.5069879525237613,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.21533548831939697,
+ "learning_rate": 0.00022675737918370628,
+ "loss": 0.4585062026977539,
+ "mean_token_accuracy": 0.865652882390552,
+ "num_tokens": 1925307.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4457989126443863,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.24705350399017334,
+ "learning_rate": 0.00022476750117512737,
+ "loss": 0.4026710891723633,
+ "mean_token_accuracy": 0.8774338760972022,
+ "num_tokens": 2055627.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.45084999009966853,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.26643863320350647,
+ "learning_rate": 0.00022254280560567822,
+ "loss": 0.40950340270996094,
+ "mean_token_accuracy": 0.8752825647592545,
+ "num_tokens": 2177850.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4516983331739903,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.26972198486328125,
+ "learning_rate": 0.00022008813646608725,
+ "loss": 0.4115512466430664,
+ "mean_token_accuracy": 0.8761435833573341,
+ "num_tokens": 2304612.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.45280722543597224,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.2643600106239319,
+ "learning_rate": 0.00021740883848518684,
+ "loss": 0.41181053161621095,
+ "mean_token_accuracy": 0.8756420350074768,
+ "num_tokens": 2430946.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4487862553447485,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.2849285304546356,
+ "learning_rate": 0.00021451074549244846,
+ "loss": 0.4094270706176758,
+ "mean_token_accuracy": 0.8771369129419326,
+ "num_tokens": 2557241.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.45000545382499696,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.24081671237945557,
+ "learning_rate": 0.0002114001677155633,
+ "loss": 0.4073855972290039,
+ "mean_token_accuracy": 0.8779223081469536,
+ "num_tokens": 2692775.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4576124830543995,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.2341010719537735,
+ "learning_rate": 0.00020808387804072673,
+ "loss": 0.4154107666015625,
+ "mean_token_accuracy": 0.8756425747275353,
+ "num_tokens": 2823060.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4812743577361107,
+ "eval_loss": 0.523208498954773,
+ "eval_mean_token_accuracy": 0.8494922530651092,
+ "eval_num_tokens": 2879229.0,
+ "eval_runtime": 51.3707,
+ "eval_samples_per_second": 31.107,
+ "eval_steps_per_second": 3.893,
+ "step": 1122
+ },
+ {
+ "entropy": 0.41342759042075183,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.34829744696617126,
+ "learning_rate": 0.0002045690972655427,
+ "loss": 0.3644887542724609,
+ "mean_token_accuracy": 0.8879408977850519,
+ "num_tokens": 2955424.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.3948360003530979,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.312427282333374,
+ "learning_rate": 0.00020086347837665854,
+ "loss": 0.34146129608154296,
+ "mean_token_accuracy": 0.8914449456334114,
+ "num_tokens": 3078799.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.38602679744362833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.2715625762939453,
+ "learning_rate": 0.0001969750898863629,
+ "loss": 0.34105979919433593,
+ "mean_token_accuracy": 0.8928995525836945,
+ "num_tokens": 3211945.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.3923524462431669,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.27295640110969543,
+ "learning_rate": 0.00019291239826442992,
+ "loss": 0.3473458099365234,
+ "mean_token_accuracy": 0.8913829082250595,
+ "num_tokens": 3343933.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.40172914519906044,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.32684192061424255,
+ "learning_rate": 0.0001886842495034615,
+ "loss": 0.35543827056884764,
+ "mean_token_accuracy": 0.8904094022512435,
+ "num_tokens": 3470087.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4010512103140354,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.23922984302043915,
+ "learning_rate": 0.00018429984985786734,
+ "loss": 0.3535212326049805,
+ "mean_token_accuracy": 0.8891122484207153,
+ "num_tokens": 3590858.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.39226082623004915,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.25130993127822876,
+ "learning_rate": 0.00017976874579842046,
+ "loss": 0.3484851837158203,
+ "mean_token_accuracy": 0.8916165816783905,
+ "num_tokens": 3725806.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.45332007586956025,
+ "eval_loss": 0.5243425965309143,
+ "eval_mean_token_accuracy": 0.8532214590907097,
+ "eval_num_tokens": 3838972.0,
+ "eval_runtime": 51.3837,
+ "eval_samples_per_second": 31.099,
+ "eval_steps_per_second": 3.892,
+ "step": 1496
+ },
+ {
+ "entropy": 0.39358587043754983,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.27960067987442017,
+ "learning_rate": 0.0001751008032260355,
+ "loss": 0.34616813659667967,
+ "mean_token_accuracy": 0.8923709010234987,
+ "num_tokens": 3849380.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3206369188427925,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.37261858582496643,
+ "learning_rate": 0.00017030618599002818,
+ "loss": 0.2684581565856934,
+ "mean_token_accuracy": 0.9131761506199837,
+ "num_tokens": 3976694.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3254615054279566,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.39803799986839294,
+ "learning_rate": 0.00016539533375763032,
+ "loss": 0.2769618606567383,
+ "mean_token_accuracy": 0.9102409112453461,
+ "num_tokens": 4110204.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.32003962114453316,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.35707366466522217,
+ "learning_rate": 0.0001603789392829468,
+ "loss": 0.2749842834472656,
+ "mean_token_accuracy": 0.910626070201397,
+ "num_tokens": 4240883.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.32672152675688265,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.47052621841430664,
+ "learning_rate": 0.00015526792512484774,
+ "loss": 0.27983531951904295,
+ "mean_token_accuracy": 0.9093958771228791,
+ "num_tokens": 4365381.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.33449163861572745,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.330709844827652,
+ "learning_rate": 0.00015007341986449012,
+ "loss": 0.28533639907836916,
+ "mean_token_accuracy": 0.9082232251763344,
+ "num_tokens": 4490711.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.33353066638112067,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.3990134298801422,
+ "learning_rate": 0.00014480673387425272,
+ "loss": 0.28489078521728517,
+ "mean_token_accuracy": 0.908001911342144,
+ "num_tokens": 4618532.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3272412090748549,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3183020353317261,
+ "learning_rate": 0.00013947933469084315,
+ "loss": 0.2772365379333496,
+ "mean_token_accuracy": 0.908946952521801,
+ "num_tokens": 4752261.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3932068006694317,
+ "eval_loss": 0.5647156834602356,
+ "eval_mean_token_accuracy": 0.85078628718853,
+ "eval_num_tokens": 4798715.0,
+ "eval_runtime": 51.3578,
+ "eval_samples_per_second": 31.115,
+ "eval_steps_per_second": 3.894,
+ "step": 1870
+ },
+ {
+ "entropy": 0.2832252390005372,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.3697633147239685,
+ "learning_rate": 0.00013410282204620014,
+ "loss": 0.2279021453857422,
+ "mean_token_accuracy": 0.9252248072262966,
+ "num_tokens": 4879271.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.250804705247283,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.3890162706375122,
+ "learning_rate": 0.00012868890261055722,
+ "loss": 0.1980854606628418,
+ "mean_token_accuracy": 0.9338876655697823,
+ "num_tokens": 5005076.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.2531572911888361,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.43466225266456604,
+ "learning_rate": 0.0001232493645026623,
+ "loss": 0.20114482879638673,
+ "mean_token_accuracy": 0.9317018255591393,
+ "num_tokens": 5133591.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.25918263107538225,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.38253673911094666,
+ "learning_rate": 0.00011779605162265297,
+ "loss": 0.2056061363220215,
+ "mean_token_accuracy": 0.9302830925583839,
+ "num_tokens": 5257252.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.2553627458959818,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.4536231458187103,
+ "learning_rate": 0.00011234083786347563,
+ "loss": 0.20531394958496094,
+ "mean_token_accuracy": 0.9302299374341965,
+ "num_tokens": 5388652.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.2575570110231638,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.36399731040000916,
+ "learning_rate": 0.00010689560125699833,
+ "loss": 0.2048162841796875,
+ "mean_token_accuracy": 0.9306997761130333,
+ "num_tokens": 5515488.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.24660897620022296,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.43602702021598816,
+ "learning_rate": 0.00010147219811111233,
+ "loss": 0.1986431884765625,
+ "mean_token_accuracy": 0.9335029146075249,
+ "num_tokens": 5644323.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.32280903935432437,
+ "eval_loss": 0.6383674144744873,
+ "eval_mean_token_accuracy": 0.849581449329853,
+ "eval_num_tokens": 5758458.0,
+ "eval_runtime": 51.3251,
+ "eval_samples_per_second": 31.135,
+ "eval_steps_per_second": 3.897,
+ "step": 2244
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.465057375940987e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.0094403300459725,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "q_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..98c363caf2498eca36506878bf632fd3e3a40abe
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2618/trainer_state.json
@@ -0,0 +1,631 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.0,
+ "eval_steps": 500,
+ "global_step": 2618,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2357245934009553,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1940524578094482,
+ "learning_rate": 3.0637587087867373e-05,
+ "loss": 2.0509376525878906,
+ "mean_token_accuracy": 0.6194104523956776,
+ "num_tokens": 127704.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8588631230592728,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6621095538139343,
+ "learning_rate": 6.190043105507899e-05,
+ "loss": 0.8043325805664062,
+ "mean_token_accuracy": 0.7855384379625321,
+ "num_tokens": 256077.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6726470375061036,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5255736112594604,
+ "learning_rate": 9.316327502229059e-05,
+ "loss": 0.6415129089355469,
+ "mean_token_accuracy": 0.8183896672725678,
+ "num_tokens": 387735.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6119109469652176,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45975860953330994,
+ "learning_rate": 0.0001244261189895022,
+ "loss": 0.576114501953125,
+ "mean_token_accuracy": 0.8376823288202285,
+ "num_tokens": 522202.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5972725109755993,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.40055611729621887,
+ "learning_rate": 0.0001556889629567138,
+ "loss": 0.5613529205322265,
+ "mean_token_accuracy": 0.8418879929184914,
+ "num_tokens": 648663.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5673307004570961,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.7427454590797424,
+ "learning_rate": 0.0001869518069239254,
+ "loss": 0.5325925827026368,
+ "mean_token_accuracy": 0.8488189685344696,
+ "num_tokens": 778245.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5615521620213986,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.4017025828361511,
+ "learning_rate": 0.000218214650891137,
+ "loss": 0.522266616821289,
+ "mean_token_accuracy": 0.8500416606664658,
+ "num_tokens": 905328.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5758187392354012,
+ "eval_loss": 0.5664522051811218,
+ "eval_mean_token_accuracy": 0.8383084440231323,
+ "eval_num_tokens": 959743.0,
+ "eval_runtime": 51.7964,
+ "eval_samples_per_second": 30.852,
+ "eval_steps_per_second": 3.861,
+ "step": 374
+ },
+ {
+ "entropy": 0.5440343178883947,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.3084093928337097,
+ "learning_rate": 0.00023381424541885068,
+ "loss": 0.5051319122314453,
+ "mean_token_accuracy": 0.8554374280602041,
+ "num_tokens": 1022581.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5245272906124592,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3342743515968323,
+ "learning_rate": 0.00023355972972676628,
+ "loss": 0.4855318450927734,
+ "mean_token_accuracy": 0.8584430786967278,
+ "num_tokens": 1151430.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5234513898193837,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.26953577995300293,
+ "learning_rate": 0.00023305125251804043,
+ "loss": 0.4811768341064453,
+ "mean_token_accuracy": 0.8606387570500373,
+ "num_tokens": 1282481.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5147616830468178,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.28791534900665283,
+ "learning_rate": 0.0002322899209369128,
+ "loss": 0.4762062835693359,
+ "mean_token_accuracy": 0.8612849581241607,
+ "num_tokens": 1414166.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5060296922922134,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.2496163249015808,
+ "learning_rate": 0.0002312773926857543,
+ "loss": 0.4695176315307617,
+ "mean_token_accuracy": 0.8636157616972924,
+ "num_tokens": 1547902.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5008939932286739,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2329542636871338,
+ "learning_rate": 0.00023001587241563198,
+ "loss": 0.46317913055419924,
+ "mean_token_accuracy": 0.8652430367469788,
+ "num_tokens": 1678635.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.4986082436144352,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2459402084350586,
+ "learning_rate": 0.00022850810692596235,
+ "loss": 0.4617066192626953,
+ "mean_token_accuracy": 0.8672981086373329,
+ "num_tokens": 1803328.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.546445109397173,
+ "eval_loss": 0.5307288765907288,
+ "eval_mean_token_accuracy": 0.8434162598848343,
+ "eval_num_tokens": 1919486.0,
+ "eval_runtime": 51.3627,
+ "eval_samples_per_second": 31.112,
+ "eval_steps_per_second": 3.894,
+ "step": 748
+ },
+ {
+ "entropy": 0.5069879525237613,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.21533548831939697,
+ "learning_rate": 0.00022675737918370628,
+ "loss": 0.4585062026977539,
+ "mean_token_accuracy": 0.865652882390552,
+ "num_tokens": 1925307.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4457989126443863,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.24705350399017334,
+ "learning_rate": 0.00022476750117512737,
+ "loss": 0.4026710891723633,
+ "mean_token_accuracy": 0.8774338760972022,
+ "num_tokens": 2055627.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.45084999009966853,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.26643863320350647,
+ "learning_rate": 0.00022254280560567822,
+ "loss": 0.40950340270996094,
+ "mean_token_accuracy": 0.8752825647592545,
+ "num_tokens": 2177850.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4516983331739903,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.26972198486328125,
+ "learning_rate": 0.00022008813646608725,
+ "loss": 0.4115512466430664,
+ "mean_token_accuracy": 0.8761435833573341,
+ "num_tokens": 2304612.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.45280722543597224,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.2643600106239319,
+ "learning_rate": 0.00021740883848518684,
+ "loss": 0.41181053161621095,
+ "mean_token_accuracy": 0.8756420350074768,
+ "num_tokens": 2430946.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4487862553447485,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.2849285304546356,
+ "learning_rate": 0.00021451074549244846,
+ "loss": 0.4094270706176758,
+ "mean_token_accuracy": 0.8771369129419326,
+ "num_tokens": 2557241.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.45000545382499696,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.24081671237945557,
+ "learning_rate": 0.0002114001677155633,
+ "loss": 0.4073855972290039,
+ "mean_token_accuracy": 0.8779223081469536,
+ "num_tokens": 2692775.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4576124830543995,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.2341010719537735,
+ "learning_rate": 0.00020808387804072673,
+ "loss": 0.4154107666015625,
+ "mean_token_accuracy": 0.8756425747275353,
+ "num_tokens": 2823060.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4812743577361107,
+ "eval_loss": 0.523208498954773,
+ "eval_mean_token_accuracy": 0.8494922530651092,
+ "eval_num_tokens": 2879229.0,
+ "eval_runtime": 51.3707,
+ "eval_samples_per_second": 31.107,
+ "eval_steps_per_second": 3.893,
+ "step": 1122
+ },
+ {
+ "entropy": 0.41342759042075183,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.34829744696617126,
+ "learning_rate": 0.0002045690972655427,
+ "loss": 0.3644887542724609,
+ "mean_token_accuracy": 0.8879408977850519,
+ "num_tokens": 2955424.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.3948360003530979,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.312427282333374,
+ "learning_rate": 0.00020086347837665854,
+ "loss": 0.34146129608154296,
+ "mean_token_accuracy": 0.8914449456334114,
+ "num_tokens": 3078799.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.38602679744362833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.2715625762939453,
+ "learning_rate": 0.0001969750898863629,
+ "loss": 0.34105979919433593,
+ "mean_token_accuracy": 0.8928995525836945,
+ "num_tokens": 3211945.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.3923524462431669,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.27295640110969543,
+ "learning_rate": 0.00019291239826442992,
+ "loss": 0.3473458099365234,
+ "mean_token_accuracy": 0.8913829082250595,
+ "num_tokens": 3343933.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.40172914519906044,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.32684192061424255,
+ "learning_rate": 0.0001886842495034615,
+ "loss": 0.35543827056884764,
+ "mean_token_accuracy": 0.8904094022512435,
+ "num_tokens": 3470087.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4010512103140354,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.23922984302043915,
+ "learning_rate": 0.00018429984985786734,
+ "loss": 0.3535212326049805,
+ "mean_token_accuracy": 0.8891122484207153,
+ "num_tokens": 3590858.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.39226082623004915,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.25130993127822876,
+ "learning_rate": 0.00017976874579842046,
+ "loss": 0.3484851837158203,
+ "mean_token_accuracy": 0.8916165816783905,
+ "num_tokens": 3725806.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.45332007586956025,
+ "eval_loss": 0.5243425965309143,
+ "eval_mean_token_accuracy": 0.8532214590907097,
+ "eval_num_tokens": 3838972.0,
+ "eval_runtime": 51.3837,
+ "eval_samples_per_second": 31.099,
+ "eval_steps_per_second": 3.892,
+ "step": 1496
+ },
+ {
+ "entropy": 0.39358587043754983,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.27960067987442017,
+ "learning_rate": 0.0001751008032260355,
+ "loss": 0.34616813659667967,
+ "mean_token_accuracy": 0.8923709010234987,
+ "num_tokens": 3849380.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3206369188427925,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.37261858582496643,
+ "learning_rate": 0.00017030618599002818,
+ "loss": 0.2684581565856934,
+ "mean_token_accuracy": 0.9131761506199837,
+ "num_tokens": 3976694.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3254615054279566,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.39803799986839294,
+ "learning_rate": 0.00016539533375763032,
+ "loss": 0.2769618606567383,
+ "mean_token_accuracy": 0.9102409112453461,
+ "num_tokens": 4110204.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.32003962114453316,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.35707366466522217,
+ "learning_rate": 0.0001603789392829468,
+ "loss": 0.2749842834472656,
+ "mean_token_accuracy": 0.910626070201397,
+ "num_tokens": 4240883.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.32672152675688265,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.47052621841430664,
+ "learning_rate": 0.00015526792512484774,
+ "loss": 0.27983531951904295,
+ "mean_token_accuracy": 0.9093958771228791,
+ "num_tokens": 4365381.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.33449163861572745,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.330709844827652,
+ "learning_rate": 0.00015007341986449012,
+ "loss": 0.28533639907836916,
+ "mean_token_accuracy": 0.9082232251763344,
+ "num_tokens": 4490711.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.33353066638112067,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.3990134298801422,
+ "learning_rate": 0.00014480673387425272,
+ "loss": 0.28489078521728517,
+ "mean_token_accuracy": 0.908001911342144,
+ "num_tokens": 4618532.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3272412090748549,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3183020353317261,
+ "learning_rate": 0.00013947933469084315,
+ "loss": 0.2772365379333496,
+ "mean_token_accuracy": 0.908946952521801,
+ "num_tokens": 4752261.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3932068006694317,
+ "eval_loss": 0.5647156834602356,
+ "eval_mean_token_accuracy": 0.85078628718853,
+ "eval_num_tokens": 4798715.0,
+ "eval_runtime": 51.3578,
+ "eval_samples_per_second": 31.115,
+ "eval_steps_per_second": 3.894,
+ "step": 1870
+ },
+ {
+ "entropy": 0.2832252390005372,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.3697633147239685,
+ "learning_rate": 0.00013410282204620014,
+ "loss": 0.2279021453857422,
+ "mean_token_accuracy": 0.9252248072262966,
+ "num_tokens": 4879271.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.250804705247283,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.3890162706375122,
+ "learning_rate": 0.00012868890261055722,
+ "loss": 0.1980854606628418,
+ "mean_token_accuracy": 0.9338876655697823,
+ "num_tokens": 5005076.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.2531572911888361,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.43466225266456604,
+ "learning_rate": 0.0001232493645026623,
+ "loss": 0.20114482879638673,
+ "mean_token_accuracy": 0.9317018255591393,
+ "num_tokens": 5133591.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.25918263107538225,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.38253673911094666,
+ "learning_rate": 0.00011779605162265297,
+ "loss": 0.2056061363220215,
+ "mean_token_accuracy": 0.9302830925583839,
+ "num_tokens": 5257252.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.2553627458959818,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.4536231458187103,
+ "learning_rate": 0.00011234083786347563,
+ "loss": 0.20531394958496094,
+ "mean_token_accuracy": 0.9302299374341965,
+ "num_tokens": 5388652.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.2575570110231638,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.36399731040000916,
+ "learning_rate": 0.00010689560125699833,
+ "loss": 0.2048162841796875,
+ "mean_token_accuracy": 0.9306997761130333,
+ "num_tokens": 5515488.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.24660897620022296,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.43602702021598816,
+ "learning_rate": 0.00010147219811111233,
+ "loss": 0.1986431884765625,
+ "mean_token_accuracy": 0.9335029146075249,
+ "num_tokens": 5644323.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.32280903935432437,
+ "eval_loss": 0.6383674144744873,
+ "eval_mean_token_accuracy": 0.849581449329853,
+ "eval_num_tokens": 5758458.0,
+ "eval_runtime": 51.3251,
+ "eval_samples_per_second": 31.135,
+ "eval_steps_per_second": 3.897,
+ "step": 2244
+ },
+ {
+ "entropy": 0.24638524448329752,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.3538878262042999,
+ "learning_rate": 9.608243719413435e-05,
+ "loss": 0.19203664779663085,
+ "mean_token_accuracy": 0.9353304363862432,
+ "num_tokens": 5773027.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.17814311504364014,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.3886161148548126,
+ "learning_rate": 9.07380540227205e-05,
+ "loss": 0.12442682266235351,
+ "mean_token_accuracy": 0.9582101872563362,
+ "num_tokens": 5904840.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.17237136442214251,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.38807374238967896,
+ "learning_rate": 8.545068530927622e-05,
+ "loss": 0.12445520401000977,
+ "mean_token_accuracy": 0.9580146077275277,
+ "num_tokens": 6037457.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.18334724467247723,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.48334068059921265,
+ "learning_rate": 8.023184362449975e-05,
+ "loss": 0.12853397369384767,
+ "mean_token_accuracy": 0.956232733130455,
+ "num_tokens": 6161042.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.17894859783351422,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.3343403935432434,
+ "learning_rate": 7.509289233022861e-05,
+ "loss": 0.12605968475341797,
+ "mean_token_accuracy": 0.9566894540190697,
+ "num_tokens": 6291748.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.17900108266621828,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.41615480184555054,
+ "learning_rate": 7.00450208371691e-05,
+ "loss": 0.12843725204467774,
+ "mean_token_accuracy": 0.956638223528862,
+ "num_tokens": 6419265.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.18341445792466401,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.3722545802593231,
+ "learning_rate": 6.509922024138231e-05,
+ "loss": 0.13251185417175293,
+ "mean_token_accuracy": 0.9549383011460304,
+ "num_tokens": 6544758.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.17789534136652946,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.33068087697029114,
+ "learning_rate": 6.02662593925748e-05,
+ "loss": 0.126302547454834,
+ "mean_token_accuracy": 0.9568320420384407,
+ "num_tokens": 6674626.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.2639452085644007,
+ "eval_loss": 0.7510635852813721,
+ "eval_mean_token_accuracy": 0.8456276795268058,
+ "eval_num_tokens": 6718201.0,
+ "eval_runtime": 51.3435,
+ "eval_samples_per_second": 31.124,
+ "eval_steps_per_second": 3.895,
+ "step": 2618
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.8789404553397146e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.0094403300459725,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "q_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Renders a conversation with <|im_start|>/<|im_end|> delimiters: optional tool-aware system block, then every message, then the generation prompt. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Walk the messages backwards to find the last real user query, i.e. a user turn that is not a wrapped tool response. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {#- No separate reasoning_content: split inline reasoning from the answer at the closing think tag. -#}{%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {#- Reasoning is re-emitted only for assistant turns after the last user query: always for the final message, otherwise only when it is non-empty. -#}{%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {#- Consecutive tool results are grouped into a single user turn. -#}{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {#- enable_thinking=false pre-fills an empty think block in the generation prompt. -#}{%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d0e6b6ba6e3b30989caa6b4be5d65f59c2c24c39
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-2992/trainer_state.json
@@ -0,0 +1,712 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.0,
+ "eval_steps": 500,
+ "global_step": 2992,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2357245934009553,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1940524578094482,
+ "learning_rate": 3.0637587087867373e-05,
+ "loss": 2.0509376525878906,
+ "mean_token_accuracy": 0.6194104523956776,
+ "num_tokens": 127704.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8588631230592728,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6621095538139343,
+ "learning_rate": 6.190043105507899e-05,
+ "loss": 0.8043325805664062,
+ "mean_token_accuracy": 0.7855384379625321,
+ "num_tokens": 256077.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6726470375061036,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5255736112594604,
+ "learning_rate": 9.316327502229059e-05,
+ "loss": 0.6415129089355469,
+ "mean_token_accuracy": 0.8183896672725678,
+ "num_tokens": 387735.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6119109469652176,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45975860953330994,
+ "learning_rate": 0.0001244261189895022,
+ "loss": 0.576114501953125,
+ "mean_token_accuracy": 0.8376823288202285,
+ "num_tokens": 522202.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5972725109755993,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.40055611729621887,
+ "learning_rate": 0.0001556889629567138,
+ "loss": 0.5613529205322265,
+ "mean_token_accuracy": 0.8418879929184914,
+ "num_tokens": 648663.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5673307004570961,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.7427454590797424,
+ "learning_rate": 0.0001869518069239254,
+ "loss": 0.5325925827026368,
+ "mean_token_accuracy": 0.8488189685344696,
+ "num_tokens": 778245.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5615521620213986,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.4017025828361511,
+ "learning_rate": 0.000218214650891137,
+ "loss": 0.522266616821289,
+ "mean_token_accuracy": 0.8500416606664658,
+ "num_tokens": 905328.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5758187392354012,
+ "eval_loss": 0.5664522051811218,
+ "eval_mean_token_accuracy": 0.8383084440231323,
+ "eval_num_tokens": 959743.0,
+ "eval_runtime": 51.7964,
+ "eval_samples_per_second": 30.852,
+ "eval_steps_per_second": 3.861,
+ "step": 374
+ },
+ {
+ "entropy": 0.5440343178883947,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.3084093928337097,
+ "learning_rate": 0.00023381424541885068,
+ "loss": 0.5051319122314453,
+ "mean_token_accuracy": 0.8554374280602041,
+ "num_tokens": 1022581.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5245272906124592,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3342743515968323,
+ "learning_rate": 0.00023355972972676628,
+ "loss": 0.4855318450927734,
+ "mean_token_accuracy": 0.8584430786967278,
+ "num_tokens": 1151430.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5234513898193837,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.26953577995300293,
+ "learning_rate": 0.00023305125251804043,
+ "loss": 0.4811768341064453,
+ "mean_token_accuracy": 0.8606387570500373,
+ "num_tokens": 1282481.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5147616830468178,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.28791534900665283,
+ "learning_rate": 0.0002322899209369128,
+ "loss": 0.4762062835693359,
+ "mean_token_accuracy": 0.8612849581241607,
+ "num_tokens": 1414166.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5060296922922134,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.2496163249015808,
+ "learning_rate": 0.0002312773926857543,
+ "loss": 0.4695176315307617,
+ "mean_token_accuracy": 0.8636157616972924,
+ "num_tokens": 1547902.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5008939932286739,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2329542636871338,
+ "learning_rate": 0.00023001587241563198,
+ "loss": 0.46317913055419924,
+ "mean_token_accuracy": 0.8652430367469788,
+ "num_tokens": 1678635.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.4986082436144352,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2459402084350586,
+ "learning_rate": 0.00022850810692596235,
+ "loss": 0.4617066192626953,
+ "mean_token_accuracy": 0.8672981086373329,
+ "num_tokens": 1803328.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.546445109397173,
+ "eval_loss": 0.5307288765907288,
+ "eval_mean_token_accuracy": 0.8434162598848343,
+ "eval_num_tokens": 1919486.0,
+ "eval_runtime": 51.3627,
+ "eval_samples_per_second": 31.112,
+ "eval_steps_per_second": 3.894,
+ "step": 748
+ },
+ {
+ "entropy": 0.5069879525237613,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.21533548831939697,
+ "learning_rate": 0.00022675737918370628,
+ "loss": 0.4585062026977539,
+ "mean_token_accuracy": 0.865652882390552,
+ "num_tokens": 1925307.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4457989126443863,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.24705350399017334,
+ "learning_rate": 0.00022476750117512737,
+ "loss": 0.4026710891723633,
+ "mean_token_accuracy": 0.8774338760972022,
+ "num_tokens": 2055627.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.45084999009966853,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.26643863320350647,
+ "learning_rate": 0.00022254280560567822,
+ "loss": 0.40950340270996094,
+ "mean_token_accuracy": 0.8752825647592545,
+ "num_tokens": 2177850.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4516983331739903,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.26972198486328125,
+ "learning_rate": 0.00022008813646608725,
+ "loss": 0.4115512466430664,
+ "mean_token_accuracy": 0.8761435833573341,
+ "num_tokens": 2304612.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.45280722543597224,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.2643600106239319,
+ "learning_rate": 0.00021740883848518684,
+ "loss": 0.41181053161621095,
+ "mean_token_accuracy": 0.8756420350074768,
+ "num_tokens": 2430946.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4487862553447485,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.2849285304546356,
+ "learning_rate": 0.00021451074549244846,
+ "loss": 0.4094270706176758,
+ "mean_token_accuracy": 0.8771369129419326,
+ "num_tokens": 2557241.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.45000545382499696,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.24081671237945557,
+ "learning_rate": 0.0002114001677155633,
+ "loss": 0.4073855972290039,
+ "mean_token_accuracy": 0.8779223081469536,
+ "num_tokens": 2692775.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4576124830543995,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.2341010719537735,
+ "learning_rate": 0.00020808387804072673,
+ "loss": 0.4154107666015625,
+ "mean_token_accuracy": 0.8756425747275353,
+ "num_tokens": 2823060.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4812743577361107,
+ "eval_loss": 0.523208498954773,
+ "eval_mean_token_accuracy": 0.8494922530651092,
+ "eval_num_tokens": 2879229.0,
+ "eval_runtime": 51.3707,
+ "eval_samples_per_second": 31.107,
+ "eval_steps_per_second": 3.893,
+ "step": 1122
+ },
+ {
+ "entropy": 0.41342759042075183,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.34829744696617126,
+ "learning_rate": 0.0002045690972655427,
+ "loss": 0.3644887542724609,
+ "mean_token_accuracy": 0.8879408977850519,
+ "num_tokens": 2955424.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.3948360003530979,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.312427282333374,
+ "learning_rate": 0.00020086347837665854,
+ "loss": 0.34146129608154296,
+ "mean_token_accuracy": 0.8914449456334114,
+ "num_tokens": 3078799.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.38602679744362833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.2715625762939453,
+ "learning_rate": 0.0001969750898863629,
+ "loss": 0.34105979919433593,
+ "mean_token_accuracy": 0.8928995525836945,
+ "num_tokens": 3211945.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.3923524462431669,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.27295640110969543,
+ "learning_rate": 0.00019291239826442992,
+ "loss": 0.3473458099365234,
+ "mean_token_accuracy": 0.8913829082250595,
+ "num_tokens": 3343933.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.40172914519906044,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.32684192061424255,
+ "learning_rate": 0.0001886842495034615,
+ "loss": 0.35543827056884764,
+ "mean_token_accuracy": 0.8904094022512435,
+ "num_tokens": 3470087.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4010512103140354,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.23922984302043915,
+ "learning_rate": 0.00018429984985786734,
+ "loss": 0.3535212326049805,
+ "mean_token_accuracy": 0.8891122484207153,
+ "num_tokens": 3590858.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.39226082623004915,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.25130993127822876,
+ "learning_rate": 0.00017976874579842046,
+ "loss": 0.3484851837158203,
+ "mean_token_accuracy": 0.8916165816783905,
+ "num_tokens": 3725806.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.45332007586956025,
+ "eval_loss": 0.5243425965309143,
+ "eval_mean_token_accuracy": 0.8532214590907097,
+ "eval_num_tokens": 3838972.0,
+ "eval_runtime": 51.3837,
+ "eval_samples_per_second": 31.099,
+ "eval_steps_per_second": 3.892,
+ "step": 1496
+ },
+ {
+ "entropy": 0.39358587043754983,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.27960067987442017,
+ "learning_rate": 0.0001751008032260355,
+ "loss": 0.34616813659667967,
+ "mean_token_accuracy": 0.8923709010234987,
+ "num_tokens": 3849380.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3206369188427925,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.37261858582496643,
+ "learning_rate": 0.00017030618599002818,
+ "loss": 0.2684581565856934,
+ "mean_token_accuracy": 0.9131761506199837,
+ "num_tokens": 3976694.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3254615054279566,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.39803799986839294,
+ "learning_rate": 0.00016539533375763032,
+ "loss": 0.2769618606567383,
+ "mean_token_accuracy": 0.9102409112453461,
+ "num_tokens": 4110204.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.32003962114453316,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.35707366466522217,
+ "learning_rate": 0.0001603789392829468,
+ "loss": 0.2749842834472656,
+ "mean_token_accuracy": 0.910626070201397,
+ "num_tokens": 4240883.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.32672152675688265,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.47052621841430664,
+ "learning_rate": 0.00015526792512484774,
+ "loss": 0.27983531951904295,
+ "mean_token_accuracy": 0.9093958771228791,
+ "num_tokens": 4365381.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.33449163861572745,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.330709844827652,
+ "learning_rate": 0.00015007341986449012,
+ "loss": 0.28533639907836916,
+ "mean_token_accuracy": 0.9082232251763344,
+ "num_tokens": 4490711.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.33353066638112067,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.3990134298801422,
+ "learning_rate": 0.00014480673387425272,
+ "loss": 0.28489078521728517,
+ "mean_token_accuracy": 0.908001911342144,
+ "num_tokens": 4618532.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3272412090748549,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3183020353317261,
+ "learning_rate": 0.00013947933469084315,
+ "loss": 0.2772365379333496,
+ "mean_token_accuracy": 0.908946952521801,
+ "num_tokens": 4752261.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3932068006694317,
+ "eval_loss": 0.5647156834602356,
+ "eval_mean_token_accuracy": 0.85078628718853,
+ "eval_num_tokens": 4798715.0,
+ "eval_runtime": 51.3578,
+ "eval_samples_per_second": 31.115,
+ "eval_steps_per_second": 3.894,
+ "step": 1870
+ },
+ {
+ "entropy": 0.2832252390005372,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.3697633147239685,
+ "learning_rate": 0.00013410282204620014,
+ "loss": 0.2279021453857422,
+ "mean_token_accuracy": 0.9252248072262966,
+ "num_tokens": 4879271.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.250804705247283,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.3890162706375122,
+ "learning_rate": 0.00012868890261055722,
+ "loss": 0.1980854606628418,
+ "mean_token_accuracy": 0.9338876655697823,
+ "num_tokens": 5005076.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.2531572911888361,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.43466225266456604,
+ "learning_rate": 0.0001232493645026623,
+ "loss": 0.20114482879638673,
+ "mean_token_accuracy": 0.9317018255591393,
+ "num_tokens": 5133591.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.25918263107538225,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.38253673911094666,
+ "learning_rate": 0.00011779605162265297,
+ "loss": 0.2056061363220215,
+ "mean_token_accuracy": 0.9302830925583839,
+ "num_tokens": 5257252.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.2553627458959818,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.4536231458187103,
+ "learning_rate": 0.00011234083786347563,
+ "loss": 0.20531394958496094,
+ "mean_token_accuracy": 0.9302299374341965,
+ "num_tokens": 5388652.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.2575570110231638,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.36399731040000916,
+ "learning_rate": 0.00010689560125699833,
+ "loss": 0.2048162841796875,
+ "mean_token_accuracy": 0.9306997761130333,
+ "num_tokens": 5515488.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.24660897620022296,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.43602702021598816,
+ "learning_rate": 0.00010147219811111233,
+ "loss": 0.1986431884765625,
+ "mean_token_accuracy": 0.9335029146075249,
+ "num_tokens": 5644323.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.32280903935432437,
+ "eval_loss": 0.6383674144744873,
+ "eval_mean_token_accuracy": 0.849581449329853,
+ "eval_num_tokens": 5758458.0,
+ "eval_runtime": 51.3251,
+ "eval_samples_per_second": 31.135,
+ "eval_steps_per_second": 3.897,
+ "step": 2244
+ },
+ {
+ "entropy": 0.24638524448329752,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.3538878262042999,
+ "learning_rate": 9.608243719413435e-05,
+ "loss": 0.19203664779663085,
+ "mean_token_accuracy": 0.9353304363862432,
+ "num_tokens": 5773027.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.17814311504364014,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.3886161148548126,
+ "learning_rate": 9.07380540227205e-05,
+ "loss": 0.12442682266235351,
+ "mean_token_accuracy": 0.9582101872563362,
+ "num_tokens": 5904840.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.17237136442214251,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.38807374238967896,
+ "learning_rate": 8.545068530927622e-05,
+ "loss": 0.12445520401000977,
+ "mean_token_accuracy": 0.9580146077275277,
+ "num_tokens": 6037457.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.18334724467247723,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.48334068059921265,
+ "learning_rate": 8.023184362449975e-05,
+ "loss": 0.12853397369384767,
+ "mean_token_accuracy": 0.956232733130455,
+ "num_tokens": 6161042.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.17894859783351422,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.3343403935432434,
+ "learning_rate": 7.509289233022861e-05,
+ "loss": 0.12605968475341797,
+ "mean_token_accuracy": 0.9566894540190697,
+ "num_tokens": 6291748.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.17900108266621828,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.41615480184555054,
+ "learning_rate": 7.00450208371691e-05,
+ "loss": 0.12843725204467774,
+ "mean_token_accuracy": 0.956638223528862,
+ "num_tokens": 6419265.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.18341445792466401,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.3722545802593231,
+ "learning_rate": 6.509922024138231e-05,
+ "loss": 0.13251185417175293,
+ "mean_token_accuracy": 0.9549383011460304,
+ "num_tokens": 6544758.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.17789534136652946,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.33068087697029114,
+ "learning_rate": 6.02662593925748e-05,
+ "loss": 0.126302547454834,
+ "mean_token_accuracy": 0.9568320420384407,
+ "num_tokens": 6674626.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.2639452085644007,
+ "eval_loss": 0.7510635852813721,
+ "eval_mean_token_accuracy": 0.8456276795268058,
+ "eval_num_tokens": 6718201.0,
+ "eval_runtime": 51.3435,
+ "eval_samples_per_second": 31.124,
+ "eval_steps_per_second": 3.895,
+ "step": 2618
+ },
+ {
+ "entropy": 0.15149398003187445,
+ "epoch": 7.085676037483267,
+ "grad_norm": 0.3362366855144501,
+ "learning_rate": 5.5556661446302733e-05,
+ "loss": 0.09618576049804688,
+ "mean_token_accuracy": 0.9674260706612559,
+ "num_tokens": 6803830.0,
+ "step": 2650
+ },
+ {
+ "entropy": 0.1279136904887855,
+ "epoch": 7.21954484605087,
+ "grad_norm": 0.2925446033477783,
+ "learning_rate": 5.0980680951143166e-05,
+ "loss": 0.07902004718780517,
+ "mean_token_accuracy": 0.9733691918849945,
+ "num_tokens": 6936289.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.13701909594237804,
+ "epoch": 7.353413654618474,
+ "grad_norm": 0.2659797668457031,
+ "learning_rate": 4.6548281520723104e-05,
+ "loss": 0.08250561714172364,
+ "mean_token_accuracy": 0.971816695034504,
+ "num_tokens": 7057823.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.12771729078143834,
+ "epoch": 7.4872824631860775,
+ "grad_norm": 0.34247028827667236,
+ "learning_rate": 4.2269114139222296e-05,
+ "loss": 0.08109721183776855,
+ "mean_token_accuracy": 0.9735026282072067,
+ "num_tokens": 7187020.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.12350119687616826,
+ "epoch": 7.621151271753681,
+ "grad_norm": 0.40673893690109253,
+ "learning_rate": 3.8152496147586614e-05,
+ "loss": 0.07707037448883057,
+ "mean_token_accuracy": 0.9743763041496277,
+ "num_tokens": 7323457.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.12642662361264229,
+ "epoch": 7.755020080321285,
+ "grad_norm": 0.3064998984336853,
+ "learning_rate": 3.4207390956206875e-05,
+ "loss": 0.07985133647918702,
+ "mean_token_accuracy": 0.9733496251702308,
+ "num_tokens": 7451765.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.1269074559956789,
+ "epoch": 7.888888888888889,
+ "grad_norm": 0.24833732843399048,
+ "learning_rate": 3.0442388528236647e-05,
+ "loss": 0.0821513843536377,
+ "mean_token_accuracy": 0.9733479696512223,
+ "num_tokens": 7578222.0,
+ "step": 2950
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2316589906811714,
+ "eval_loss": 0.851987898349762,
+ "eval_mean_token_accuracy": 0.8467304027080536,
+ "eval_num_tokens": 7677944.0,
+ "eval_runtime": 51.3501,
+ "eval_samples_per_second": 31.12,
+ "eval_steps_per_second": 3.895,
+ "step": 2992
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.2928370910154854e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling (text generation)
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.0094403300459725,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "q_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..54835173496a2e4cee9b6d7268fd47ce2694cd87
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3366/trainer_state.json
@@ -0,0 +1,803 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.0,
+ "eval_steps": 500,
+ "global_step": 3366,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2357245934009553,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1940524578094482,
+ "learning_rate": 3.0637587087867373e-05,
+ "loss": 2.0509376525878906,
+ "mean_token_accuracy": 0.6194104523956776,
+ "num_tokens": 127704.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8588631230592728,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6621095538139343,
+ "learning_rate": 6.190043105507899e-05,
+ "loss": 0.8043325805664062,
+ "mean_token_accuracy": 0.7855384379625321,
+ "num_tokens": 256077.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6726470375061036,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5255736112594604,
+ "learning_rate": 9.316327502229059e-05,
+ "loss": 0.6415129089355469,
+ "mean_token_accuracy": 0.8183896672725678,
+ "num_tokens": 387735.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6119109469652176,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45975860953330994,
+ "learning_rate": 0.0001244261189895022,
+ "loss": 0.576114501953125,
+ "mean_token_accuracy": 0.8376823288202285,
+ "num_tokens": 522202.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5972725109755993,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.40055611729621887,
+ "learning_rate": 0.0001556889629567138,
+ "loss": 0.5613529205322265,
+ "mean_token_accuracy": 0.8418879929184914,
+ "num_tokens": 648663.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5673307004570961,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.7427454590797424,
+ "learning_rate": 0.0001869518069239254,
+ "loss": 0.5325925827026368,
+ "mean_token_accuracy": 0.8488189685344696,
+ "num_tokens": 778245.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5615521620213986,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.4017025828361511,
+ "learning_rate": 0.000218214650891137,
+ "loss": 0.522266616821289,
+ "mean_token_accuracy": 0.8500416606664658,
+ "num_tokens": 905328.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5758187392354012,
+ "eval_loss": 0.5664522051811218,
+ "eval_mean_token_accuracy": 0.8383084440231323,
+ "eval_num_tokens": 959743.0,
+ "eval_runtime": 51.7964,
+ "eval_samples_per_second": 30.852,
+ "eval_steps_per_second": 3.861,
+ "step": 374
+ },
+ {
+ "entropy": 0.5440343178883947,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.3084093928337097,
+ "learning_rate": 0.00023381424541885068,
+ "loss": 0.5051319122314453,
+ "mean_token_accuracy": 0.8554374280602041,
+ "num_tokens": 1022581.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5245272906124592,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3342743515968323,
+ "learning_rate": 0.00023355972972676628,
+ "loss": 0.4855318450927734,
+ "mean_token_accuracy": 0.8584430786967278,
+ "num_tokens": 1151430.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5234513898193837,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.26953577995300293,
+ "learning_rate": 0.00023305125251804043,
+ "loss": 0.4811768341064453,
+ "mean_token_accuracy": 0.8606387570500373,
+ "num_tokens": 1282481.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5147616830468178,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.28791534900665283,
+ "learning_rate": 0.0002322899209369128,
+ "loss": 0.4762062835693359,
+ "mean_token_accuracy": 0.8612849581241607,
+ "num_tokens": 1414166.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5060296922922134,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.2496163249015808,
+ "learning_rate": 0.0002312773926857543,
+ "loss": 0.4695176315307617,
+ "mean_token_accuracy": 0.8636157616972924,
+ "num_tokens": 1547902.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5008939932286739,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2329542636871338,
+ "learning_rate": 0.00023001587241563198,
+ "loss": 0.46317913055419924,
+ "mean_token_accuracy": 0.8652430367469788,
+ "num_tokens": 1678635.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.4986082436144352,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2459402084350586,
+ "learning_rate": 0.00022850810692596235,
+ "loss": 0.4617066192626953,
+ "mean_token_accuracy": 0.8672981086373329,
+ "num_tokens": 1803328.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.546445109397173,
+ "eval_loss": 0.5307288765907288,
+ "eval_mean_token_accuracy": 0.8434162598848343,
+ "eval_num_tokens": 1919486.0,
+ "eval_runtime": 51.3627,
+ "eval_samples_per_second": 31.112,
+ "eval_steps_per_second": 3.894,
+ "step": 748
+ },
+ {
+ "entropy": 0.5069879525237613,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.21533548831939697,
+ "learning_rate": 0.00022675737918370628,
+ "loss": 0.4585062026977539,
+ "mean_token_accuracy": 0.865652882390552,
+ "num_tokens": 1925307.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4457989126443863,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.24705350399017334,
+ "learning_rate": 0.00022476750117512737,
+ "loss": 0.4026710891723633,
+ "mean_token_accuracy": 0.8774338760972022,
+ "num_tokens": 2055627.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.45084999009966853,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.26643863320350647,
+ "learning_rate": 0.00022254280560567822,
+ "loss": 0.40950340270996094,
+ "mean_token_accuracy": 0.8752825647592545,
+ "num_tokens": 2177850.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4516983331739903,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.26972198486328125,
+ "learning_rate": 0.00022008813646608725,
+ "loss": 0.4115512466430664,
+ "mean_token_accuracy": 0.8761435833573341,
+ "num_tokens": 2304612.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.45280722543597224,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.2643600106239319,
+ "learning_rate": 0.00021740883848518684,
+ "loss": 0.41181053161621095,
+ "mean_token_accuracy": 0.8756420350074768,
+ "num_tokens": 2430946.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4487862553447485,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.2849285304546356,
+ "learning_rate": 0.00021451074549244846,
+ "loss": 0.4094270706176758,
+ "mean_token_accuracy": 0.8771369129419326,
+ "num_tokens": 2557241.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.45000545382499696,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.24081671237945557,
+ "learning_rate": 0.0002114001677155633,
+ "loss": 0.4073855972290039,
+ "mean_token_accuracy": 0.8779223081469536,
+ "num_tokens": 2692775.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4576124830543995,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.2341010719537735,
+ "learning_rate": 0.00020808387804072673,
+ "loss": 0.4154107666015625,
+ "mean_token_accuracy": 0.8756425747275353,
+ "num_tokens": 2823060.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4812743577361107,
+ "eval_loss": 0.523208498954773,
+ "eval_mean_token_accuracy": 0.8494922530651092,
+ "eval_num_tokens": 2879229.0,
+ "eval_runtime": 51.3707,
+ "eval_samples_per_second": 31.107,
+ "eval_steps_per_second": 3.893,
+ "step": 1122
+ },
+ {
+ "entropy": 0.41342759042075183,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.34829744696617126,
+ "learning_rate": 0.0002045690972655427,
+ "loss": 0.3644887542724609,
+ "mean_token_accuracy": 0.8879408977850519,
+ "num_tokens": 2955424.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.3948360003530979,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.312427282333374,
+ "learning_rate": 0.00020086347837665854,
+ "loss": 0.34146129608154296,
+ "mean_token_accuracy": 0.8914449456334114,
+ "num_tokens": 3078799.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.38602679744362833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.2715625762939453,
+ "learning_rate": 0.0001969750898863629,
+ "loss": 0.34105979919433593,
+ "mean_token_accuracy": 0.8928995525836945,
+ "num_tokens": 3211945.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.3923524462431669,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.27295640110969543,
+ "learning_rate": 0.00019291239826442992,
+ "loss": 0.3473458099365234,
+ "mean_token_accuracy": 0.8913829082250595,
+ "num_tokens": 3343933.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.40172914519906044,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.32684192061424255,
+ "learning_rate": 0.0001886842495034615,
+ "loss": 0.35543827056884764,
+ "mean_token_accuracy": 0.8904094022512435,
+ "num_tokens": 3470087.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4010512103140354,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.23922984302043915,
+ "learning_rate": 0.00018429984985786734,
+ "loss": 0.3535212326049805,
+ "mean_token_accuracy": 0.8891122484207153,
+ "num_tokens": 3590858.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.39226082623004915,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.25130993127822876,
+ "learning_rate": 0.00017976874579842046,
+ "loss": 0.3484851837158203,
+ "mean_token_accuracy": 0.8916165816783905,
+ "num_tokens": 3725806.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.45332007586956025,
+ "eval_loss": 0.5243425965309143,
+ "eval_mean_token_accuracy": 0.8532214590907097,
+ "eval_num_tokens": 3838972.0,
+ "eval_runtime": 51.3837,
+ "eval_samples_per_second": 31.099,
+ "eval_steps_per_second": 3.892,
+ "step": 1496
+ },
+ {
+ "entropy": 0.39358587043754983,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.27960067987442017,
+ "learning_rate": 0.0001751008032260355,
+ "loss": 0.34616813659667967,
+ "mean_token_accuracy": 0.8923709010234987,
+ "num_tokens": 3849380.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3206369188427925,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.37261858582496643,
+ "learning_rate": 0.00017030618599002818,
+ "loss": 0.2684581565856934,
+ "mean_token_accuracy": 0.9131761506199837,
+ "num_tokens": 3976694.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3254615054279566,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.39803799986839294,
+ "learning_rate": 0.00016539533375763032,
+ "loss": 0.2769618606567383,
+ "mean_token_accuracy": 0.9102409112453461,
+ "num_tokens": 4110204.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.32003962114453316,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.35707366466522217,
+ "learning_rate": 0.0001603789392829468,
+ "loss": 0.2749842834472656,
+ "mean_token_accuracy": 0.910626070201397,
+ "num_tokens": 4240883.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.32672152675688265,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.47052621841430664,
+ "learning_rate": 0.00015526792512484774,
+ "loss": 0.27983531951904295,
+ "mean_token_accuracy": 0.9093958771228791,
+ "num_tokens": 4365381.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.33449163861572745,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.330709844827652,
+ "learning_rate": 0.00015007341986449012,
+ "loss": 0.28533639907836916,
+ "mean_token_accuracy": 0.9082232251763344,
+ "num_tokens": 4490711.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.33353066638112067,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.3990134298801422,
+ "learning_rate": 0.00014480673387425272,
+ "loss": 0.28489078521728517,
+ "mean_token_accuracy": 0.908001911342144,
+ "num_tokens": 4618532.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3272412090748549,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3183020353317261,
+ "learning_rate": 0.00013947933469084315,
+ "loss": 0.2772365379333496,
+ "mean_token_accuracy": 0.908946952521801,
+ "num_tokens": 4752261.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3932068006694317,
+ "eval_loss": 0.5647156834602356,
+ "eval_mean_token_accuracy": 0.85078628718853,
+ "eval_num_tokens": 4798715.0,
+ "eval_runtime": 51.3578,
+ "eval_samples_per_second": 31.115,
+ "eval_steps_per_second": 3.894,
+ "step": 1870
+ },
+ {
+ "entropy": 0.2832252390005372,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.3697633147239685,
+ "learning_rate": 0.00013410282204620014,
+ "loss": 0.2279021453857422,
+ "mean_token_accuracy": 0.9252248072262966,
+ "num_tokens": 4879271.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.250804705247283,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.3890162706375122,
+ "learning_rate": 0.00012868890261055722,
+ "loss": 0.1980854606628418,
+ "mean_token_accuracy": 0.9338876655697823,
+ "num_tokens": 5005076.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.2531572911888361,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.43466225266456604,
+ "learning_rate": 0.0001232493645026623,
+ "loss": 0.20114482879638673,
+ "mean_token_accuracy": 0.9317018255591393,
+ "num_tokens": 5133591.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.25918263107538225,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.38253673911094666,
+ "learning_rate": 0.00011779605162265297,
+ "loss": 0.2056061363220215,
+ "mean_token_accuracy": 0.9302830925583839,
+ "num_tokens": 5257252.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.2553627458959818,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.4536231458187103,
+ "learning_rate": 0.00011234083786347563,
+ "loss": 0.20531394958496094,
+ "mean_token_accuracy": 0.9302299374341965,
+ "num_tokens": 5388652.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.2575570110231638,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.36399731040000916,
+ "learning_rate": 0.00010689560125699833,
+ "loss": 0.2048162841796875,
+ "mean_token_accuracy": 0.9306997761130333,
+ "num_tokens": 5515488.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.24660897620022296,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.43602702021598816,
+ "learning_rate": 0.00010147219811111233,
+ "loss": 0.1986431884765625,
+ "mean_token_accuracy": 0.9335029146075249,
+ "num_tokens": 5644323.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.32280903935432437,
+ "eval_loss": 0.6383674144744873,
+ "eval_mean_token_accuracy": 0.849581449329853,
+ "eval_num_tokens": 5758458.0,
+ "eval_runtime": 51.3251,
+ "eval_samples_per_second": 31.135,
+ "eval_steps_per_second": 3.897,
+ "step": 2244
+ },
+ {
+ "entropy": 0.24638524448329752,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.3538878262042999,
+ "learning_rate": 9.608243719413435e-05,
+ "loss": 0.19203664779663085,
+ "mean_token_accuracy": 0.9353304363862432,
+ "num_tokens": 5773027.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.17814311504364014,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.3886161148548126,
+ "learning_rate": 9.07380540227205e-05,
+ "loss": 0.12442682266235351,
+ "mean_token_accuracy": 0.9582101872563362,
+ "num_tokens": 5904840.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.17237136442214251,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.38807374238967896,
+ "learning_rate": 8.545068530927622e-05,
+ "loss": 0.12445520401000977,
+ "mean_token_accuracy": 0.9580146077275277,
+ "num_tokens": 6037457.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.18334724467247723,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.48334068059921265,
+ "learning_rate": 8.023184362449975e-05,
+ "loss": 0.12853397369384767,
+ "mean_token_accuracy": 0.956232733130455,
+ "num_tokens": 6161042.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.17894859783351422,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.3343403935432434,
+ "learning_rate": 7.509289233022861e-05,
+ "loss": 0.12605968475341797,
+ "mean_token_accuracy": 0.9566894540190697,
+ "num_tokens": 6291748.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.17900108266621828,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.41615480184555054,
+ "learning_rate": 7.00450208371691e-05,
+ "loss": 0.12843725204467774,
+ "mean_token_accuracy": 0.956638223528862,
+ "num_tokens": 6419265.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.18341445792466401,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.3722545802593231,
+ "learning_rate": 6.509922024138231e-05,
+ "loss": 0.13251185417175293,
+ "mean_token_accuracy": 0.9549383011460304,
+ "num_tokens": 6544758.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.17789534136652946,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.33068087697029114,
+ "learning_rate": 6.02662593925748e-05,
+ "loss": 0.126302547454834,
+ "mean_token_accuracy": 0.9568320420384407,
+ "num_tokens": 6674626.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.2639452085644007,
+ "eval_loss": 0.7510635852813721,
+ "eval_mean_token_accuracy": 0.8456276795268058,
+ "eval_num_tokens": 6718201.0,
+ "eval_runtime": 51.3435,
+ "eval_samples_per_second": 31.124,
+ "eval_steps_per_second": 3.895,
+ "step": 2618
+ },
+ {
+ "entropy": 0.15149398003187445,
+ "epoch": 7.085676037483267,
+ "grad_norm": 0.3362366855144501,
+ "learning_rate": 5.5556661446302733e-05,
+ "loss": 0.09618576049804688,
+ "mean_token_accuracy": 0.9674260706612559,
+ "num_tokens": 6803830.0,
+ "step": 2650
+ },
+ {
+ "entropy": 0.1279136904887855,
+ "epoch": 7.21954484605087,
+ "grad_norm": 0.2925446033477783,
+ "learning_rate": 5.0980680951143166e-05,
+ "loss": 0.07902004718780517,
+ "mean_token_accuracy": 0.9733691918849945,
+ "num_tokens": 6936289.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.13701909594237804,
+ "epoch": 7.353413654618474,
+ "grad_norm": 0.2659797668457031,
+ "learning_rate": 4.6548281520723104e-05,
+ "loss": 0.08250561714172364,
+ "mean_token_accuracy": 0.971816695034504,
+ "num_tokens": 7057823.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.12771729078143834,
+ "epoch": 7.4872824631860775,
+ "grad_norm": 0.34247028827667236,
+ "learning_rate": 4.2269114139222296e-05,
+ "loss": 0.08109721183776855,
+ "mean_token_accuracy": 0.9735026282072067,
+ "num_tokens": 7187020.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.12350119687616826,
+ "epoch": 7.621151271753681,
+ "grad_norm": 0.40673893690109253,
+ "learning_rate": 3.8152496147586614e-05,
+ "loss": 0.07707037448883057,
+ "mean_token_accuracy": 0.9743763041496277,
+ "num_tokens": 7323457.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.12642662361264229,
+ "epoch": 7.755020080321285,
+ "grad_norm": 0.3064998984336853,
+ "learning_rate": 3.4207390956206875e-05,
+ "loss": 0.07985133647918702,
+ "mean_token_accuracy": 0.9733496251702308,
+ "num_tokens": 7451765.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.1269074559956789,
+ "epoch": 7.888888888888889,
+ "grad_norm": 0.24833732843399048,
+ "learning_rate": 3.0442388528236647e-05,
+ "loss": 0.0821513843536377,
+ "mean_token_accuracy": 0.9733479696512223,
+ "num_tokens": 7578222.0,
+ "step": 2950
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2316589906811714,
+ "eval_loss": 0.851987898349762,
+ "eval_mean_token_accuracy": 0.8467304027080536,
+ "eval_num_tokens": 7677944.0,
+ "eval_runtime": 51.3501,
+ "eval_samples_per_second": 31.12,
+ "eval_steps_per_second": 3.895,
+ "step": 2992
+ },
+ {
+ "entropy": 0.13051860011888272,
+ "epoch": 8.021419009370817,
+ "grad_norm": 0.13740092515945435,
+ "learning_rate": 2.686568667604363e-05,
+ "loss": 0.08182425498962402,
+ "mean_token_accuracy": 0.9728358243450974,
+ "num_tokens": 7699511.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.11715538412332535,
+ "epoch": 8.15528781793842,
+ "grad_norm": 0.22101899981498718,
+ "learning_rate": 2.3485073211519044e-05,
+ "loss": 0.06548665523529053,
+ "mean_token_accuracy": 0.9781955161690712,
+ "num_tokens": 7822731.0,
+ "step": 3050
+ },
+ {
+ "entropy": 0.10994385546073318,
+ "epoch": 8.289156626506024,
+ "grad_norm": 0.18131813406944275,
+ "learning_rate": 2.0307908989111124e-05,
+ "loss": 0.06045622825622558,
+ "mean_token_accuracy": 0.9784942081570626,
+ "num_tokens": 7954237.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.10547435775399208,
+ "epoch": 8.423025435073628,
+ "grad_norm": 0.219278946518898,
+ "learning_rate": 1.734111187850385e-05,
+ "loss": 0.06194626808166504,
+ "mean_token_accuracy": 0.979062694311142,
+ "num_tokens": 8082999.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.10694314314052462,
+ "epoch": 8.556894243641231,
+ "grad_norm": 0.21804827451705933,
+ "learning_rate": 1.4591141701838324e-05,
+ "loss": 0.06162384033203125,
+ "mean_token_accuracy": 0.9781702619791031,
+ "num_tokens": 8214329.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.10566199742257595,
+ "epoch": 8.690763052208835,
+ "grad_norm": 0.12687279284000397,
+ "learning_rate": 1.2063986168274383e-05,
+ "loss": 0.06172010898590088,
+ "mean_token_accuracy": 0.9794953766465188,
+ "num_tokens": 8345134.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.11023524977266788,
+ "epoch": 8.824631860776439,
+ "grad_norm": 0.14195536077022552,
+ "learning_rate": 9.765147836518029e-06,
+ "loss": 0.06327592372894288,
+ "mean_token_accuracy": 0.9780235534906387,
+ "num_tokens": 8470788.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.11061601843684912,
+ "epoch": 8.958500669344042,
+ "grad_norm": 0.17863383889198303,
+ "learning_rate": 7.699632133701809e-06,
+ "loss": 0.0625003719329834,
+ "mean_token_accuracy": 0.9776859974861145,
+ "num_tokens": 8599154.0,
+ "step": 3350
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.2094542646408081,
+ "eval_loss": 0.9543951153755188,
+ "eval_mean_token_accuracy": 0.8449241068959236,
+ "eval_num_tokens": 8637687.0,
+ "eval_runtime": 51.3693,
+ "eval_samples_per_second": 31.108,
+ "eval_steps_per_second": 3.893,
+ "step": 3366
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.702890522149509e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.0094403300459725,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "q_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9ffcde7d23349a3f5589aa30f82fa513a4c60539
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-374/trainer_state.json
@@ -0,0 +1,115 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 500,
+ "global_step": 374,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2357245934009553,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1940524578094482,
+ "learning_rate": 3.0637587087867373e-05,
+ "loss": 2.0509376525878906,
+ "mean_token_accuracy": 0.6194104523956776,
+ "num_tokens": 127704.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8588631230592728,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6621095538139343,
+ "learning_rate": 6.190043105507899e-05,
+ "loss": 0.8043325805664062,
+ "mean_token_accuracy": 0.7855384379625321,
+ "num_tokens": 256077.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6726470375061036,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5255736112594604,
+ "learning_rate": 9.316327502229059e-05,
+ "loss": 0.6415129089355469,
+ "mean_token_accuracy": 0.8183896672725678,
+ "num_tokens": 387735.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6119109469652176,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45975860953330994,
+ "learning_rate": 0.0001244261189895022,
+ "loss": 0.576114501953125,
+ "mean_token_accuracy": 0.8376823288202285,
+ "num_tokens": 522202.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5972725109755993,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.40055611729621887,
+ "learning_rate": 0.0001556889629567138,
+ "loss": 0.5613529205322265,
+ "mean_token_accuracy": 0.8418879929184914,
+ "num_tokens": 648663.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5673307004570961,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.7427454590797424,
+ "learning_rate": 0.0001869518069239254,
+ "loss": 0.5325925827026368,
+ "mean_token_accuracy": 0.8488189685344696,
+ "num_tokens": 778245.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5615521620213986,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.4017025828361511,
+ "learning_rate": 0.000218214650891137,
+ "loss": 0.522266616821289,
+ "mean_token_accuracy": 0.8500416606664658,
+ "num_tokens": 905328.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5758187392354012,
+ "eval_loss": 0.5664522051811218,
+ "eval_mean_token_accuracy": 0.8383084440231323,
+ "eval_num_tokens": 959743.0,
+ "eval_runtime": 51.7964,
+ "eval_samples_per_second": 30.852,
+ "eval_steps_per_second": 3.861,
+ "step": 374
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.089940080830976e+16,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.0094403300459725,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "q_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..de6713ff31dddbc6400cf45a32cb8ebee6249ce8
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-3740/trainer_state.json
@@ -0,0 +1,884 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 10.0,
+ "eval_steps": 500,
+ "global_step": 3740,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2357245934009553,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1940524578094482,
+ "learning_rate": 3.0637587087867373e-05,
+ "loss": 2.0509376525878906,
+ "mean_token_accuracy": 0.6194104523956776,
+ "num_tokens": 127704.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8588631230592728,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6621095538139343,
+ "learning_rate": 6.190043105507899e-05,
+ "loss": 0.8043325805664062,
+ "mean_token_accuracy": 0.7855384379625321,
+ "num_tokens": 256077.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6726470375061036,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5255736112594604,
+ "learning_rate": 9.316327502229059e-05,
+ "loss": 0.6415129089355469,
+ "mean_token_accuracy": 0.8183896672725678,
+ "num_tokens": 387735.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6119109469652176,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45975860953330994,
+ "learning_rate": 0.0001244261189895022,
+ "loss": 0.576114501953125,
+ "mean_token_accuracy": 0.8376823288202285,
+ "num_tokens": 522202.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5972725109755993,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.40055611729621887,
+ "learning_rate": 0.0001556889629567138,
+ "loss": 0.5613529205322265,
+ "mean_token_accuracy": 0.8418879929184914,
+ "num_tokens": 648663.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5673307004570961,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.7427454590797424,
+ "learning_rate": 0.0001869518069239254,
+ "loss": 0.5325925827026368,
+ "mean_token_accuracy": 0.8488189685344696,
+ "num_tokens": 778245.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5615521620213986,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.4017025828361511,
+ "learning_rate": 0.000218214650891137,
+ "loss": 0.522266616821289,
+ "mean_token_accuracy": 0.8500416606664658,
+ "num_tokens": 905328.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5758187392354012,
+ "eval_loss": 0.5664522051811218,
+ "eval_mean_token_accuracy": 0.8383084440231323,
+ "eval_num_tokens": 959743.0,
+ "eval_runtime": 51.7964,
+ "eval_samples_per_second": 30.852,
+ "eval_steps_per_second": 3.861,
+ "step": 374
+ },
+ {
+ "entropy": 0.5440343178883947,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.3084093928337097,
+ "learning_rate": 0.00023381424541885068,
+ "loss": 0.5051319122314453,
+ "mean_token_accuracy": 0.8554374280602041,
+ "num_tokens": 1022581.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5245272906124592,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3342743515968323,
+ "learning_rate": 0.00023355972972676628,
+ "loss": 0.4855318450927734,
+ "mean_token_accuracy": 0.8584430786967278,
+ "num_tokens": 1151430.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5234513898193837,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.26953577995300293,
+ "learning_rate": 0.00023305125251804043,
+ "loss": 0.4811768341064453,
+ "mean_token_accuracy": 0.8606387570500373,
+ "num_tokens": 1282481.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5147616830468178,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.28791534900665283,
+ "learning_rate": 0.0002322899209369128,
+ "loss": 0.4762062835693359,
+ "mean_token_accuracy": 0.8612849581241607,
+ "num_tokens": 1414166.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5060296922922134,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.2496163249015808,
+ "learning_rate": 0.0002312773926857543,
+ "loss": 0.4695176315307617,
+ "mean_token_accuracy": 0.8636157616972924,
+ "num_tokens": 1547902.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5008939932286739,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2329542636871338,
+ "learning_rate": 0.00023001587241563198,
+ "loss": 0.46317913055419924,
+ "mean_token_accuracy": 0.8652430367469788,
+ "num_tokens": 1678635.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.4986082436144352,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2459402084350586,
+ "learning_rate": 0.00022850810692596235,
+ "loss": 0.4617066192626953,
+ "mean_token_accuracy": 0.8672981086373329,
+ "num_tokens": 1803328.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.546445109397173,
+ "eval_loss": 0.5307288765907288,
+ "eval_mean_token_accuracy": 0.8434162598848343,
+ "eval_num_tokens": 1919486.0,
+ "eval_runtime": 51.3627,
+ "eval_samples_per_second": 31.112,
+ "eval_steps_per_second": 3.894,
+ "step": 748
+ },
+ {
+ "entropy": 0.5069879525237613,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.21533548831939697,
+ "learning_rate": 0.00022675737918370628,
+ "loss": 0.4585062026977539,
+ "mean_token_accuracy": 0.865652882390552,
+ "num_tokens": 1925307.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4457989126443863,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.24705350399017334,
+ "learning_rate": 0.00022476750117512737,
+ "loss": 0.4026710891723633,
+ "mean_token_accuracy": 0.8774338760972022,
+ "num_tokens": 2055627.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.45084999009966853,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.26643863320350647,
+ "learning_rate": 0.00022254280560567822,
+ "loss": 0.40950340270996094,
+ "mean_token_accuracy": 0.8752825647592545,
+ "num_tokens": 2177850.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4516983331739903,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.26972198486328125,
+ "learning_rate": 0.00022008813646608725,
+ "loss": 0.4115512466430664,
+ "mean_token_accuracy": 0.8761435833573341,
+ "num_tokens": 2304612.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.45280722543597224,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.2643600106239319,
+ "learning_rate": 0.00021740883848518684,
+ "loss": 0.41181053161621095,
+ "mean_token_accuracy": 0.8756420350074768,
+ "num_tokens": 2430946.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4487862553447485,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.2849285304546356,
+ "learning_rate": 0.00021451074549244846,
+ "loss": 0.4094270706176758,
+ "mean_token_accuracy": 0.8771369129419326,
+ "num_tokens": 2557241.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.45000545382499696,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.24081671237945557,
+ "learning_rate": 0.0002114001677155633,
+ "loss": 0.4073855972290039,
+ "mean_token_accuracy": 0.8779223081469536,
+ "num_tokens": 2692775.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4576124830543995,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.2341010719537735,
+ "learning_rate": 0.00020808387804072673,
+ "loss": 0.4154107666015625,
+ "mean_token_accuracy": 0.8756425747275353,
+ "num_tokens": 2823060.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4812743577361107,
+ "eval_loss": 0.523208498954773,
+ "eval_mean_token_accuracy": 0.8494922530651092,
+ "eval_num_tokens": 2879229.0,
+ "eval_runtime": 51.3707,
+ "eval_samples_per_second": 31.107,
+ "eval_steps_per_second": 3.893,
+ "step": 1122
+ },
+ {
+ "entropy": 0.41342759042075183,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.34829744696617126,
+ "learning_rate": 0.0002045690972655427,
+ "loss": 0.3644887542724609,
+ "mean_token_accuracy": 0.8879408977850519,
+ "num_tokens": 2955424.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.3948360003530979,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.312427282333374,
+ "learning_rate": 0.00020086347837665854,
+ "loss": 0.34146129608154296,
+ "mean_token_accuracy": 0.8914449456334114,
+ "num_tokens": 3078799.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.38602679744362833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.2715625762939453,
+ "learning_rate": 0.0001969750898863629,
+ "loss": 0.34105979919433593,
+ "mean_token_accuracy": 0.8928995525836945,
+ "num_tokens": 3211945.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.3923524462431669,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.27295640110969543,
+ "learning_rate": 0.00019291239826442992,
+ "loss": 0.3473458099365234,
+ "mean_token_accuracy": 0.8913829082250595,
+ "num_tokens": 3343933.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.40172914519906044,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.32684192061424255,
+ "learning_rate": 0.0001886842495034615,
+ "loss": 0.35543827056884764,
+ "mean_token_accuracy": 0.8904094022512435,
+ "num_tokens": 3470087.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4010512103140354,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.23922984302043915,
+ "learning_rate": 0.00018429984985786734,
+ "loss": 0.3535212326049805,
+ "mean_token_accuracy": 0.8891122484207153,
+ "num_tokens": 3590858.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.39226082623004915,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.25130993127822876,
+ "learning_rate": 0.00017976874579842046,
+ "loss": 0.3484851837158203,
+ "mean_token_accuracy": 0.8916165816783905,
+ "num_tokens": 3725806.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.45332007586956025,
+ "eval_loss": 0.5243425965309143,
+ "eval_mean_token_accuracy": 0.8532214590907097,
+ "eval_num_tokens": 3838972.0,
+ "eval_runtime": 51.3837,
+ "eval_samples_per_second": 31.099,
+ "eval_steps_per_second": 3.892,
+ "step": 1496
+ },
+ {
+ "entropy": 0.39358587043754983,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.27960067987442017,
+ "learning_rate": 0.0001751008032260355,
+ "loss": 0.34616813659667967,
+ "mean_token_accuracy": 0.8923709010234987,
+ "num_tokens": 3849380.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3206369188427925,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.37261858582496643,
+ "learning_rate": 0.00017030618599002818,
+ "loss": 0.2684581565856934,
+ "mean_token_accuracy": 0.9131761506199837,
+ "num_tokens": 3976694.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3254615054279566,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.39803799986839294,
+ "learning_rate": 0.00016539533375763032,
+ "loss": 0.2769618606567383,
+ "mean_token_accuracy": 0.9102409112453461,
+ "num_tokens": 4110204.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.32003962114453316,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.35707366466522217,
+ "learning_rate": 0.0001603789392829468,
+ "loss": 0.2749842834472656,
+ "mean_token_accuracy": 0.910626070201397,
+ "num_tokens": 4240883.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.32672152675688265,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.47052621841430664,
+ "learning_rate": 0.00015526792512484774,
+ "loss": 0.27983531951904295,
+ "mean_token_accuracy": 0.9093958771228791,
+ "num_tokens": 4365381.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.33449163861572745,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.330709844827652,
+ "learning_rate": 0.00015007341986449012,
+ "loss": 0.28533639907836916,
+ "mean_token_accuracy": 0.9082232251763344,
+ "num_tokens": 4490711.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.33353066638112067,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.3990134298801422,
+ "learning_rate": 0.00014480673387425272,
+ "loss": 0.28489078521728517,
+ "mean_token_accuracy": 0.908001911342144,
+ "num_tokens": 4618532.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3272412090748549,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3183020353317261,
+ "learning_rate": 0.00013947933469084315,
+ "loss": 0.2772365379333496,
+ "mean_token_accuracy": 0.908946952521801,
+ "num_tokens": 4752261.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3932068006694317,
+ "eval_loss": 0.5647156834602356,
+ "eval_mean_token_accuracy": 0.85078628718853,
+ "eval_num_tokens": 4798715.0,
+ "eval_runtime": 51.3578,
+ "eval_samples_per_second": 31.115,
+ "eval_steps_per_second": 3.894,
+ "step": 1870
+ },
+ {
+ "entropy": 0.2832252390005372,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.3697633147239685,
+ "learning_rate": 0.00013410282204620014,
+ "loss": 0.2279021453857422,
+ "mean_token_accuracy": 0.9252248072262966,
+ "num_tokens": 4879271.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.250804705247283,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.3890162706375122,
+ "learning_rate": 0.00012868890261055722,
+ "loss": 0.1980854606628418,
+ "mean_token_accuracy": 0.9338876655697823,
+ "num_tokens": 5005076.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.2531572911888361,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.43466225266456604,
+ "learning_rate": 0.0001232493645026623,
+ "loss": 0.20114482879638673,
+ "mean_token_accuracy": 0.9317018255591393,
+ "num_tokens": 5133591.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.25918263107538225,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.38253673911094666,
+ "learning_rate": 0.00011779605162265297,
+ "loss": 0.2056061363220215,
+ "mean_token_accuracy": 0.9302830925583839,
+ "num_tokens": 5257252.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.2553627458959818,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.4536231458187103,
+ "learning_rate": 0.00011234083786347563,
+ "loss": 0.20531394958496094,
+ "mean_token_accuracy": 0.9302299374341965,
+ "num_tokens": 5388652.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.2575570110231638,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.36399731040000916,
+ "learning_rate": 0.00010689560125699833,
+ "loss": 0.2048162841796875,
+ "mean_token_accuracy": 0.9306997761130333,
+ "num_tokens": 5515488.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.24660897620022296,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.43602702021598816,
+ "learning_rate": 0.00010147219811111233,
+ "loss": 0.1986431884765625,
+ "mean_token_accuracy": 0.9335029146075249,
+ "num_tokens": 5644323.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.32280903935432437,
+ "eval_loss": 0.6383674144744873,
+ "eval_mean_token_accuracy": 0.849581449329853,
+ "eval_num_tokens": 5758458.0,
+ "eval_runtime": 51.3251,
+ "eval_samples_per_second": 31.135,
+ "eval_steps_per_second": 3.897,
+ "step": 2244
+ },
+ {
+ "entropy": 0.24638524448329752,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.3538878262042999,
+ "learning_rate": 9.608243719413435e-05,
+ "loss": 0.19203664779663085,
+ "mean_token_accuracy": 0.9353304363862432,
+ "num_tokens": 5773027.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.17814311504364014,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.3886161148548126,
+ "learning_rate": 9.07380540227205e-05,
+ "loss": 0.12442682266235351,
+ "mean_token_accuracy": 0.9582101872563362,
+ "num_tokens": 5904840.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.17237136442214251,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.38807374238967896,
+ "learning_rate": 8.545068530927622e-05,
+ "loss": 0.12445520401000977,
+ "mean_token_accuracy": 0.9580146077275277,
+ "num_tokens": 6037457.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.18334724467247723,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.48334068059921265,
+ "learning_rate": 8.023184362449975e-05,
+ "loss": 0.12853397369384767,
+ "mean_token_accuracy": 0.956232733130455,
+ "num_tokens": 6161042.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.17894859783351422,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.3343403935432434,
+ "learning_rate": 7.509289233022861e-05,
+ "loss": 0.12605968475341797,
+ "mean_token_accuracy": 0.9566894540190697,
+ "num_tokens": 6291748.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.17900108266621828,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.41615480184555054,
+ "learning_rate": 7.00450208371691e-05,
+ "loss": 0.12843725204467774,
+ "mean_token_accuracy": 0.956638223528862,
+ "num_tokens": 6419265.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.18341445792466401,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.3722545802593231,
+ "learning_rate": 6.509922024138231e-05,
+ "loss": 0.13251185417175293,
+ "mean_token_accuracy": 0.9549383011460304,
+ "num_tokens": 6544758.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.17789534136652946,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.33068087697029114,
+ "learning_rate": 6.02662593925748e-05,
+ "loss": 0.126302547454834,
+ "mean_token_accuracy": 0.9568320420384407,
+ "num_tokens": 6674626.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.2639452085644007,
+ "eval_loss": 0.7510635852813721,
+ "eval_mean_token_accuracy": 0.8456276795268058,
+ "eval_num_tokens": 6718201.0,
+ "eval_runtime": 51.3435,
+ "eval_samples_per_second": 31.124,
+ "eval_steps_per_second": 3.895,
+ "step": 2618
+ },
+ {
+ "entropy": 0.15149398003187445,
+ "epoch": 7.085676037483267,
+ "grad_norm": 0.3362366855144501,
+ "learning_rate": 5.5556661446302733e-05,
+ "loss": 0.09618576049804688,
+ "mean_token_accuracy": 0.9674260706612559,
+ "num_tokens": 6803830.0,
+ "step": 2650
+ },
+ {
+ "entropy": 0.1279136904887855,
+ "epoch": 7.21954484605087,
+ "grad_norm": 0.2925446033477783,
+ "learning_rate": 5.0980680951143166e-05,
+ "loss": 0.07902004718780517,
+ "mean_token_accuracy": 0.9733691918849945,
+ "num_tokens": 6936289.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.13701909594237804,
+ "epoch": 7.353413654618474,
+ "grad_norm": 0.2659797668457031,
+ "learning_rate": 4.6548281520723104e-05,
+ "loss": 0.08250561714172364,
+ "mean_token_accuracy": 0.971816695034504,
+ "num_tokens": 7057823.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.12771729078143834,
+ "epoch": 7.4872824631860775,
+ "grad_norm": 0.34247028827667236,
+ "learning_rate": 4.2269114139222296e-05,
+ "loss": 0.08109721183776855,
+ "mean_token_accuracy": 0.9735026282072067,
+ "num_tokens": 7187020.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.12350119687616826,
+ "epoch": 7.621151271753681,
+ "grad_norm": 0.40673893690109253,
+ "learning_rate": 3.8152496147586614e-05,
+ "loss": 0.07707037448883057,
+ "mean_token_accuracy": 0.9743763041496277,
+ "num_tokens": 7323457.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.12642662361264229,
+ "epoch": 7.755020080321285,
+ "grad_norm": 0.3064998984336853,
+ "learning_rate": 3.4207390956206875e-05,
+ "loss": 0.07985133647918702,
+ "mean_token_accuracy": 0.9733496251702308,
+ "num_tokens": 7451765.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.1269074559956789,
+ "epoch": 7.888888888888889,
+ "grad_norm": 0.24833732843399048,
+ "learning_rate": 3.0442388528236647e-05,
+ "loss": 0.0821513843536377,
+ "mean_token_accuracy": 0.9733479696512223,
+ "num_tokens": 7578222.0,
+ "step": 2950
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2316589906811714,
+ "eval_loss": 0.851987898349762,
+ "eval_mean_token_accuracy": 0.8467304027080536,
+ "eval_num_tokens": 7677944.0,
+ "eval_runtime": 51.3501,
+ "eval_samples_per_second": 31.12,
+ "eval_steps_per_second": 3.895,
+ "step": 2992
+ },
+ {
+ "entropy": 0.13051860011888272,
+ "epoch": 8.021419009370817,
+ "grad_norm": 0.13740092515945435,
+ "learning_rate": 2.686568667604363e-05,
+ "loss": 0.08182425498962402,
+ "mean_token_accuracy": 0.9728358243450974,
+ "num_tokens": 7699511.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.11715538412332535,
+ "epoch": 8.15528781793842,
+ "grad_norm": 0.22101899981498718,
+ "learning_rate": 2.3485073211519044e-05,
+ "loss": 0.06548665523529053,
+ "mean_token_accuracy": 0.9781955161690712,
+ "num_tokens": 7822731.0,
+ "step": 3050
+ },
+ {
+ "entropy": 0.10994385546073318,
+ "epoch": 8.289156626506024,
+ "grad_norm": 0.18131813406944275,
+ "learning_rate": 2.0307908989111124e-05,
+ "loss": 0.06045622825622558,
+ "mean_token_accuracy": 0.9784942081570626,
+ "num_tokens": 7954237.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.10547435775399208,
+ "epoch": 8.423025435073628,
+ "grad_norm": 0.219278946518898,
+ "learning_rate": 1.734111187850385e-05,
+ "loss": 0.06194626808166504,
+ "mean_token_accuracy": 0.979062694311142,
+ "num_tokens": 8082999.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.10694314314052462,
+ "epoch": 8.556894243641231,
+ "grad_norm": 0.21804827451705933,
+ "learning_rate": 1.4591141701838324e-05,
+ "loss": 0.06162384033203125,
+ "mean_token_accuracy": 0.9781702619791031,
+ "num_tokens": 8214329.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.10566199742257595,
+ "epoch": 8.690763052208835,
+ "grad_norm": 0.12687279284000397,
+ "learning_rate": 1.2063986168274383e-05,
+ "loss": 0.06172010898590088,
+ "mean_token_accuracy": 0.9794953766465188,
+ "num_tokens": 8345134.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.11023524977266788,
+ "epoch": 8.824631860776439,
+ "grad_norm": 0.14195536077022552,
+ "learning_rate": 9.765147836518029e-06,
+ "loss": 0.06327592372894288,
+ "mean_token_accuracy": 0.9780235534906387,
+ "num_tokens": 8470788.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.11061601843684912,
+ "epoch": 8.958500669344042,
+ "grad_norm": 0.17863383889198303,
+ "learning_rate": 7.699632133701809e-06,
+ "loss": 0.0625003719329834,
+ "mean_token_accuracy": 0.9776859974861145,
+ "num_tokens": 8599154.0,
+ "step": 3350
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.2094542646408081,
+ "eval_loss": 0.9543951153755188,
+ "eval_mean_token_accuracy": 0.8449241068959236,
+ "eval_num_tokens": 8637687.0,
+ "eval_runtime": 51.3693,
+ "eval_samples_per_second": 31.108,
+ "eval_steps_per_second": 3.893,
+ "step": 3366
+ },
+ {
+ "entropy": 0.11372084063336704,
+ "epoch": 9.09103078982597,
+ "grad_norm": 0.18322895467281342,
+ "learning_rate": 5.871936456706078e-06,
+ "loss": 0.061551513671875,
+ "mean_token_accuracy": 0.9781497656696975,
+ "num_tokens": 8718657.0,
+ "step": 3400
+ },
+ {
+ "entropy": 0.1057378869690001,
+ "epoch": 9.224899598393574,
+ "grad_norm": 0.1794043332338333,
+ "learning_rate": 4.286040379651099e-06,
+ "loss": 0.05919742107391358,
+ "mean_token_accuracy": 0.979441005885601,
+ "num_tokens": 8841327.0,
+ "step": 3450
+ },
+ {
+ "entropy": 0.10552630050107836,
+ "epoch": 9.358768406961179,
+ "grad_norm": 0.2005266696214676,
+ "learning_rate": 2.945396988882265e-06,
+ "loss": 0.05768038272857666,
+ "mean_token_accuracy": 0.9795372131466865,
+ "num_tokens": 8968182.0,
+ "step": 3500
+ },
+ {
+ "entropy": 0.0980272913351655,
+ "epoch": 9.492637215528783,
+ "grad_norm": 0.22345173358917236,
+ "learning_rate": 1.8529253643150706e-06,
+ "loss": 0.05448314189910888,
+ "mean_token_accuracy": 0.9807884976267814,
+ "num_tokens": 9103318.0,
+ "step": 3550
+ },
+ {
+ "entropy": 0.10432180495932698,
+ "epoch": 9.626506024096386,
+ "grad_norm": 0.1691906750202179,
+ "learning_rate": 1.0110042235111828e-06,
+ "loss": 0.05666207790374756,
+ "mean_token_accuracy": 0.9797532597184181,
+ "num_tokens": 9233233.0,
+ "step": 3600
+ },
+ {
+ "entropy": 0.10123609615489841,
+ "epoch": 9.76037483266399,
+ "grad_norm": 0.21184755861759186,
+ "learning_rate": 4.214667423244783e-07,
+ "loss": 0.056021313667297366,
+ "mean_token_accuracy": 0.9803864064812661,
+ "num_tokens": 9362807.0,
+ "step": 3650
+ },
+ {
+ "entropy": 0.09557499976828694,
+ "epoch": 9.894243641231594,
+ "grad_norm": 0.19447794556617737,
+ "learning_rate": 8.559656339447186e-08,
+ "loss": 0.05355457782745361,
+ "mean_token_accuracy": 0.9813841906189918,
+ "num_tokens": 9500858.0,
+ "step": 3700
+ },
+ {
+ "epoch": 10.0,
+ "eval_entropy": 0.20286250963807106,
+ "eval_loss": 0.9991143941879272,
+ "eval_mean_token_accuracy": 0.8458420696854592,
+ "eval_num_tokens": 9597430.0,
+ "eval_runtime": 51.3552,
+ "eval_samples_per_second": 31.117,
+ "eval_steps_per_second": 3.894,
+ "step": 3740
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.115447797653412e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/README.md b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/adapter_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8818eeae60314bb5e6cd8bb9d31975488784721f
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.0094403300459725,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "q_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/chat_template.jinja b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/tokenizer_config.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/trainer_state.json b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d9bf2568583ede88c8066a63b851d90d79cbbdff
--- /dev/null
+++ b/DBCA_code_Swedish/Qwen3-4B-Base_code_features_structural_train_code_features_structural_test2/checkpoint-748/trainer_state.json
@@ -0,0 +1,196 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 748,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.2357245934009553,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1940524578094482,
+ "learning_rate": 3.0637587087867373e-05,
+ "loss": 2.0509376525878906,
+ "mean_token_accuracy": 0.6194104523956776,
+ "num_tokens": 127704.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8588631230592728,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6621095538139343,
+ "learning_rate": 6.190043105507899e-05,
+ "loss": 0.8043325805664062,
+ "mean_token_accuracy": 0.7855384379625321,
+ "num_tokens": 256077.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6726470375061036,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5255736112594604,
+ "learning_rate": 9.316327502229059e-05,
+ "loss": 0.6415129089355469,
+ "mean_token_accuracy": 0.8183896672725678,
+ "num_tokens": 387735.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6119109469652176,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45975860953330994,
+ "learning_rate": 0.0001244261189895022,
+ "loss": 0.576114501953125,
+ "mean_token_accuracy": 0.8376823288202285,
+ "num_tokens": 522202.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5972725109755993,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.40055611729621887,
+ "learning_rate": 0.0001556889629567138,
+ "loss": 0.5613529205322265,
+ "mean_token_accuracy": 0.8418879929184914,
+ "num_tokens": 648663.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5673307004570961,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.7427454590797424,
+ "learning_rate": 0.0001869518069239254,
+ "loss": 0.5325925827026368,
+ "mean_token_accuracy": 0.8488189685344696,
+ "num_tokens": 778245.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5615521620213986,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.4017025828361511,
+ "learning_rate": 0.000218214650891137,
+ "loss": 0.522266616821289,
+ "mean_token_accuracy": 0.8500416606664658,
+ "num_tokens": 905328.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5758187392354012,
+ "eval_loss": 0.5664522051811218,
+ "eval_mean_token_accuracy": 0.8383084440231323,
+ "eval_num_tokens": 959743.0,
+ "eval_runtime": 51.7964,
+ "eval_samples_per_second": 30.852,
+ "eval_steps_per_second": 3.861,
+ "step": 374
+ },
+ {
+ "entropy": 0.5440343178883947,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.3084093928337097,
+ "learning_rate": 0.00023381424541885068,
+ "loss": 0.5051319122314453,
+ "mean_token_accuracy": 0.8554374280602041,
+ "num_tokens": 1022581.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5245272906124592,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3342743515968323,
+ "learning_rate": 0.00023355972972676628,
+ "loss": 0.4855318450927734,
+ "mean_token_accuracy": 0.8584430786967278,
+ "num_tokens": 1151430.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5234513898193837,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.26953577995300293,
+ "learning_rate": 0.00023305125251804043,
+ "loss": 0.4811768341064453,
+ "mean_token_accuracy": 0.8606387570500373,
+ "num_tokens": 1282481.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5147616830468178,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.28791534900665283,
+ "learning_rate": 0.0002322899209369128,
+ "loss": 0.4762062835693359,
+ "mean_token_accuracy": 0.8612849581241607,
+ "num_tokens": 1414166.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5060296922922134,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.2496163249015808,
+ "learning_rate": 0.0002312773926857543,
+ "loss": 0.4695176315307617,
+ "mean_token_accuracy": 0.8636157616972924,
+ "num_tokens": 1547902.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5008939932286739,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2329542636871338,
+ "learning_rate": 0.00023001587241563198,
+ "loss": 0.46317913055419924,
+ "mean_token_accuracy": 0.8652430367469788,
+ "num_tokens": 1678635.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.4986082436144352,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2459402084350586,
+ "learning_rate": 0.00022850810692596235,
+ "loss": 0.4617066192626953,
+ "mean_token_accuracy": 0.8672981086373329,
+ "num_tokens": 1803328.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.546445109397173,
+ "eval_loss": 0.5307288765907288,
+ "eval_mean_token_accuracy": 0.8434162598848343,
+ "eval_num_tokens": 1919486.0,
+ "eval_runtime": 51.3627,
+ "eval_samples_per_second": 31.112,
+ "eval_steps_per_second": 3.894,
+ "step": 748
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 8.21288517750313e+16,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..201b3d4c1f1e8b143370f8952a59e885165126f5
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md
@@ -0,0 +1,58 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: transformers
+model_name: Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1
+tags:
+- generated_from_trainer
+- trl
+- sft
+licence: license
+---
+
+# Model Card for Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1
+
+This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="None", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[
](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/mjvftsw2)
+
+
+
+This model was trained with SFT.
+
+### Framework versions
+
+- TRL: 0.29.0
+- Transformers: 5.5.4
+- Pytorch: 2.10.0
+- Datasets: 4.6.1
+- Tokenizers: 0.22.2
+
+## Citations
+
+
+
+Cite TRL as:
+
+```bibtex
+@software{vonwerra2020trl,
+ title = {{TRL: Transformers Reinforcement Learning}},
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+ license = {Apache-2.0},
+ url = {https://github.com/huggingface/trl},
+ year = {2020}
+}
+```
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.04641649878824187,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a0fc99cd862def174d805ce466eee3156d96b716
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1155/trainer_state.json
@@ -0,0 +1,297 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 1155,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.0450534927845,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.9224470853805542,
+ "learning_rate": 1.3970570546126444e-05,
+ "loss": 1.944740753173828,
+ "mean_token_accuracy": 0.6158743992447853,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 1.1079417771101,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6827074289321899,
+ "learning_rate": 2.822625477686771e-05,
+ "loss": 1.0624147033691407,
+ "mean_token_accuracy": 0.7396554726362229,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8689985585212707,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.6320164203643799,
+ "learning_rate": 4.248193900760899e-05,
+ "loss": 0.8256858825683594,
+ "mean_token_accuracy": 0.7825630265474319,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7946180325746536,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.7246975302696228,
+ "learning_rate": 5.6737623238350247e-05,
+ "loss": 0.7475,
+ "mean_token_accuracy": 0.7959125518798829,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.7919283157587051,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.7635198831558228,
+ "learning_rate": 7.099330746909153e-05,
+ "loss": 0.737699966430664,
+ "mean_token_accuracy": 0.798919832110405,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7713775283098221,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.5751814842224121,
+ "learning_rate": 8.52489916998328e-05,
+ "loss": 0.7186790466308594,
+ "mean_token_accuracy": 0.8014196193218232,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.737273331284523,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.4956009089946747,
+ "learning_rate": 9.950467593057406e-05,
+ "loss": 0.6904002380371094,
+ "mean_token_accuracy": 0.8083628410100937,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7681069333965962,
+ "eval_loss": 0.7430074214935303,
+ "eval_mean_token_accuracy": 0.7941452006881053,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 47.9814,
+ "eval_samples_per_second": 34.534,
+ "eval_steps_per_second": 4.335,
+ "step": 385
+ },
+ {
+ "entropy": 0.7167584246397019,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.638637900352478,
+ "learning_rate": 0.00010976434715123926,
+ "loss": 0.6611510467529297,
+ "mean_token_accuracy": 0.8145374125242233,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6893188625574111,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.554688036441803,
+ "learning_rate": 0.00010967639449071182,
+ "loss": 0.6347718811035157,
+ "mean_token_accuracy": 0.8226255452632905,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6728485250473022,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.5104691386222839,
+ "learning_rate": 0.00010947585797076785,
+ "loss": 0.6233362579345703,
+ "mean_token_accuracy": 0.8227335858345032,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6812490409612656,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.4700304865837097,
+ "learning_rate": 0.00010916314964373551,
+ "loss": 0.6289351654052734,
+ "mean_token_accuracy": 0.8217820060253144,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6633523851633072,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.5094137191772461,
+ "learning_rate": 0.0001087389120469154,
+ "loss": 0.6168266296386719,
+ "mean_token_accuracy": 0.8232935756444931,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.6454597359895706,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.40546780824661255,
+ "learning_rate": 0.00010820401688232725,
+ "loss": 0.5995024108886718,
+ "mean_token_accuracy": 0.8284876370429992,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6700882267951965,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.35203394293785095,
+ "learning_rate": 0.00010755956322558065,
+ "loss": 0.616350212097168,
+ "mean_token_accuracy": 0.8246171402931214,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6587968200445176,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5248188376426697,
+ "learning_rate": 0.00010680687526754984,
+ "loss": 0.608861198425293,
+ "mean_token_accuracy": 0.8265628081560135,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.7101548368541094,
+ "eval_loss": 0.6869887113571167,
+ "eval_mean_token_accuracy": 0.8058141409777678,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 48.0161,
+ "eval_samples_per_second": 34.509,
+ "eval_steps_per_second": 4.332,
+ "step": 770
+ },
+ {
+ "entropy": 0.6272834652662277,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.5319918990135193,
+ "learning_rate": 0.00010594749959349313,
+ "loss": 0.5719264221191406,
+ "mean_token_accuracy": 0.8345229256153107,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.599158108830452,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.4296111762523651,
+ "learning_rate": 0.00010498320200520744,
+ "loss": 0.5460208129882812,
+ "mean_token_accuracy": 0.8396762716770172,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5994478952884674,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.3820176422595978,
+ "learning_rate": 0.00010391596389274791,
+ "loss": 0.5483282852172852,
+ "mean_token_accuracy": 0.8385387778282165,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5989043036103249,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.41718003153800964,
+ "learning_rate": 0.00010274797816316749,
+ "loss": 0.543673095703125,
+ "mean_token_accuracy": 0.8396250855922699,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.6005555561184883,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.4552708864212036,
+ "learning_rate": 0.00010148164473464206,
+ "loss": 0.5505282974243164,
+ "mean_token_accuracy": 0.8371219438314438,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5972397547960281,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.45796939730644226,
+ "learning_rate": 0.00010011956560523972,
+ "loss": 0.5410661697387695,
+ "mean_token_accuracy": 0.8398081564903259,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5875487339496612,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.37569308280944824,
+ "learning_rate": 9.866453950646624e-05,
+ "loss": 0.537100830078125,
+ "mean_token_accuracy": 0.8407874500751495,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.6040622130036354,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.4184563457965851,
+ "learning_rate": 9.711955615257278e-05,
+ "loss": 0.5466165924072266,
+ "mean_token_accuracy": 0.8411829793453216,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6296488522337034,
+ "eval_loss": 0.671941876411438,
+ "eval_mean_token_accuracy": 0.8121863231062889,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 47.9946,
+ "eval_samples_per_second": 34.525,
+ "eval_steps_per_second": 4.334,
+ "step": 1155
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.637823526955428e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling (text generation)
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.04641649878824187,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5d2eb3b7496b4e7b927decbc8ca5bae01d559379
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1540/trainer_state.json
@@ -0,0 +1,378 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 4.0,
+ "eval_steps": 500,
+ "global_step": 1540,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.0450534927845,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.9224470853805542,
+ "learning_rate": 1.3970570546126444e-05,
+ "loss": 1.944740753173828,
+ "mean_token_accuracy": 0.6158743992447853,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 1.1079417771101,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6827074289321899,
+ "learning_rate": 2.822625477686771e-05,
+ "loss": 1.0624147033691407,
+ "mean_token_accuracy": 0.7396554726362229,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8689985585212707,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.6320164203643799,
+ "learning_rate": 4.248193900760899e-05,
+ "loss": 0.8256858825683594,
+ "mean_token_accuracy": 0.7825630265474319,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7946180325746536,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.7246975302696228,
+ "learning_rate": 5.6737623238350247e-05,
+ "loss": 0.7475,
+ "mean_token_accuracy": 0.7959125518798829,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.7919283157587051,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.7635198831558228,
+ "learning_rate": 7.099330746909153e-05,
+ "loss": 0.737699966430664,
+ "mean_token_accuracy": 0.798919832110405,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7713775283098221,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.5751814842224121,
+ "learning_rate": 8.52489916998328e-05,
+ "loss": 0.7186790466308594,
+ "mean_token_accuracy": 0.8014196193218232,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.737273331284523,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.4956009089946747,
+ "learning_rate": 9.950467593057406e-05,
+ "loss": 0.6904002380371094,
+ "mean_token_accuracy": 0.8083628410100937,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7681069333965962,
+ "eval_loss": 0.7430074214935303,
+ "eval_mean_token_accuracy": 0.7941452006881053,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 47.9814,
+ "eval_samples_per_second": 34.534,
+ "eval_steps_per_second": 4.335,
+ "step": 385
+ },
+ {
+ "entropy": 0.7167584246397019,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.638637900352478,
+ "learning_rate": 0.00010976434715123926,
+ "loss": 0.6611510467529297,
+ "mean_token_accuracy": 0.8145374125242233,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6893188625574111,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.554688036441803,
+ "learning_rate": 0.00010967639449071182,
+ "loss": 0.6347718811035157,
+ "mean_token_accuracy": 0.8226255452632905,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6728485250473022,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.5104691386222839,
+ "learning_rate": 0.00010947585797076785,
+ "loss": 0.6233362579345703,
+ "mean_token_accuracy": 0.8227335858345032,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6812490409612656,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.4700304865837097,
+ "learning_rate": 0.00010916314964373551,
+ "loss": 0.6289351654052734,
+ "mean_token_accuracy": 0.8217820060253144,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6633523851633072,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.5094137191772461,
+ "learning_rate": 0.0001087389120469154,
+ "loss": 0.6168266296386719,
+ "mean_token_accuracy": 0.8232935756444931,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.6454597359895706,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.40546780824661255,
+ "learning_rate": 0.00010820401688232725,
+ "loss": 0.5995024108886718,
+ "mean_token_accuracy": 0.8284876370429992,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6700882267951965,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.35203394293785095,
+ "learning_rate": 0.00010755956322558065,
+ "loss": 0.616350212097168,
+ "mean_token_accuracy": 0.8246171402931214,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6587968200445176,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5248188376426697,
+ "learning_rate": 0.00010680687526754984,
+ "loss": 0.608861198425293,
+ "mean_token_accuracy": 0.8265628081560135,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.7101548368541094,
+ "eval_loss": 0.6869887113571167,
+ "eval_mean_token_accuracy": 0.8058141409777678,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 48.0161,
+ "eval_samples_per_second": 34.509,
+ "eval_steps_per_second": 4.332,
+ "step": 770
+ },
+ {
+ "entropy": 0.6272834652662277,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.5319918990135193,
+ "learning_rate": 0.00010594749959349313,
+ "loss": 0.5719264221191406,
+ "mean_token_accuracy": 0.8345229256153107,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.599158108830452,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.4296111762523651,
+ "learning_rate": 0.00010498320200520744,
+ "loss": 0.5460208129882812,
+ "mean_token_accuracy": 0.8396762716770172,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5994478952884674,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.3820176422595978,
+ "learning_rate": 0.00010391596389274791,
+ "loss": 0.5483282852172852,
+ "mean_token_accuracy": 0.8385387778282165,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5989043036103249,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.41718003153800964,
+ "learning_rate": 0.00010274797816316749,
+ "loss": 0.543673095703125,
+ "mean_token_accuracy": 0.8396250855922699,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.6005555561184883,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.4552708864212036,
+ "learning_rate": 0.00010148164473464206,
+ "loss": 0.5505282974243164,
+ "mean_token_accuracy": 0.8371219438314438,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5972397547960281,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.45796939730644226,
+ "learning_rate": 0.00010011956560523972,
+ "loss": 0.5410661697387695,
+ "mean_token_accuracy": 0.8398081564903259,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5875487339496612,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.37569308280944824,
+ "learning_rate": 9.866453950646624e-05,
+ "loss": 0.537100830078125,
+ "mean_token_accuracy": 0.8407874500751495,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.6040622130036354,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.4184563457965851,
+ "learning_rate": 9.711955615257278e-05,
+ "loss": 0.5466165924072266,
+ "mean_token_accuracy": 0.8411829793453216,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6296488522337034,
+ "eval_loss": 0.671941876411438,
+ "eval_mean_token_accuracy": 0.8121863231062889,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 47.9946,
+ "eval_samples_per_second": 34.525,
+ "eval_steps_per_second": 4.334,
+ "step": 1155
+ },
+ {
+ "entropy": 0.543001911342144,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.39842459559440613,
+ "learning_rate": 9.548779009744178e-05,
+ "loss": 0.4850382995605469,
+ "mean_token_accuracy": 0.8531740349531174,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5417884975671768,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.529087245464325,
+ "learning_rate": 9.37725942116738e-05,
+ "loss": 0.48398651123046876,
+ "mean_token_accuracy": 0.8532969230413436,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5348580208420753,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.4483303129673004,
+ "learning_rate": 9.197749279327802e-05,
+ "loss": 0.4842509078979492,
+ "mean_token_accuracy": 0.8533288407325744,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.5422027057409287,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.448234885931015,
+ "learning_rate": 9.010617432612243e-05,
+ "loss": 0.48615737915039064,
+ "mean_token_accuracy": 0.8517620968818664,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.5518654137849808,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.5707411170005798,
+ "learning_rate": 8.816248390102322e-05,
+ "loss": 0.4946014404296875,
+ "mean_token_accuracy": 0.8510974669456481,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.5268254142999649,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.42708849906921387,
+ "learning_rate": 8.615041531504609e-05,
+ "loss": 0.474882926940918,
+ "mean_token_accuracy": 0.8555294382572174,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5430487287044525,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.5479393005371094,
+ "learning_rate": 8.407410286525337e-05,
+ "loss": 0.48806171417236327,
+ "mean_token_accuracy": 0.8523763221502304,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.6002918778417202,
+ "eval_loss": 0.6801126003265381,
+ "eval_mean_token_accuracy": 0.8130095520844827,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 47.9844,
+ "eval_samples_per_second": 34.532,
+ "eval_steps_per_second": 4.335,
+ "step": 1540
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.1769399455551386e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.04641649878824187,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..73ba7a105468339427e9f7fa854426093084ee1a
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1925/trainer_state.json
@@ -0,0 +1,469 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 5.0,
+ "eval_steps": 500,
+ "global_step": 1925,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.0450534927845,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.9224470853805542,
+ "learning_rate": 1.3970570546126444e-05,
+ "loss": 1.944740753173828,
+ "mean_token_accuracy": 0.6158743992447853,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 1.1079417771101,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6827074289321899,
+ "learning_rate": 2.822625477686771e-05,
+ "loss": 1.0624147033691407,
+ "mean_token_accuracy": 0.7396554726362229,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8689985585212707,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.6320164203643799,
+ "learning_rate": 4.248193900760899e-05,
+ "loss": 0.8256858825683594,
+ "mean_token_accuracy": 0.7825630265474319,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7946180325746536,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.7246975302696228,
+ "learning_rate": 5.6737623238350247e-05,
+ "loss": 0.7475,
+ "mean_token_accuracy": 0.7959125518798829,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.7919283157587051,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.7635198831558228,
+ "learning_rate": 7.099330746909153e-05,
+ "loss": 0.737699966430664,
+ "mean_token_accuracy": 0.798919832110405,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7713775283098221,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.5751814842224121,
+ "learning_rate": 8.52489916998328e-05,
+ "loss": 0.7186790466308594,
+ "mean_token_accuracy": 0.8014196193218232,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.737273331284523,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.4956009089946747,
+ "learning_rate": 9.950467593057406e-05,
+ "loss": 0.6904002380371094,
+ "mean_token_accuracy": 0.8083628410100937,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7681069333965962,
+ "eval_loss": 0.7430074214935303,
+ "eval_mean_token_accuracy": 0.7941452006881053,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 47.9814,
+ "eval_samples_per_second": 34.534,
+ "eval_steps_per_second": 4.335,
+ "step": 385
+ },
+ {
+ "entropy": 0.7167584246397019,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.638637900352478,
+ "learning_rate": 0.00010976434715123926,
+ "loss": 0.6611510467529297,
+ "mean_token_accuracy": 0.8145374125242233,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6893188625574111,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.554688036441803,
+ "learning_rate": 0.00010967639449071182,
+ "loss": 0.6347718811035157,
+ "mean_token_accuracy": 0.8226255452632905,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6728485250473022,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.5104691386222839,
+ "learning_rate": 0.00010947585797076785,
+ "loss": 0.6233362579345703,
+ "mean_token_accuracy": 0.8227335858345032,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6812490409612656,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.4700304865837097,
+ "learning_rate": 0.00010916314964373551,
+ "loss": 0.6289351654052734,
+ "mean_token_accuracy": 0.8217820060253144,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6633523851633072,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.5094137191772461,
+ "learning_rate": 0.0001087389120469154,
+ "loss": 0.6168266296386719,
+ "mean_token_accuracy": 0.8232935756444931,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.6454597359895706,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.40546780824661255,
+ "learning_rate": 0.00010820401688232725,
+ "loss": 0.5995024108886718,
+ "mean_token_accuracy": 0.8284876370429992,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6700882267951965,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.35203394293785095,
+ "learning_rate": 0.00010755956322558065,
+ "loss": 0.616350212097168,
+ "mean_token_accuracy": 0.8246171402931214,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6587968200445176,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5248188376426697,
+ "learning_rate": 0.00010680687526754984,
+ "loss": 0.608861198425293,
+ "mean_token_accuracy": 0.8265628081560135,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.7101548368541094,
+ "eval_loss": 0.6869887113571167,
+ "eval_mean_token_accuracy": 0.8058141409777678,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 48.0161,
+ "eval_samples_per_second": 34.509,
+ "eval_steps_per_second": 4.332,
+ "step": 770
+ },
+ {
+ "entropy": 0.6272834652662277,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.5319918990135193,
+ "learning_rate": 0.00010594749959349313,
+ "loss": 0.5719264221191406,
+ "mean_token_accuracy": 0.8345229256153107,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.599158108830452,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.4296111762523651,
+ "learning_rate": 0.00010498320200520744,
+ "loss": 0.5460208129882812,
+ "mean_token_accuracy": 0.8396762716770172,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5994478952884674,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.3820176422595978,
+ "learning_rate": 0.00010391596389274791,
+ "loss": 0.5483282852172852,
+ "mean_token_accuracy": 0.8385387778282165,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5989043036103249,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.41718003153800964,
+ "learning_rate": 0.00010274797816316749,
+ "loss": 0.543673095703125,
+ "mean_token_accuracy": 0.8396250855922699,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.6005555561184883,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.4552708864212036,
+ "learning_rate": 0.00010148164473464206,
+ "loss": 0.5505282974243164,
+ "mean_token_accuracy": 0.8371219438314438,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5972397547960281,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.45796939730644226,
+ "learning_rate": 0.00010011956560523972,
+ "loss": 0.5410661697387695,
+ "mean_token_accuracy": 0.8398081564903259,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5875487339496612,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.37569308280944824,
+ "learning_rate": 9.866453950646624e-05,
+ "loss": 0.537100830078125,
+ "mean_token_accuracy": 0.8407874500751495,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.6040622130036354,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.4184563457965851,
+ "learning_rate": 9.711955615257278e-05,
+ "loss": 0.5466165924072266,
+ "mean_token_accuracy": 0.8411829793453216,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6296488522337034,
+ "eval_loss": 0.671941876411438,
+ "eval_mean_token_accuracy": 0.8121863231062889,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 47.9946,
+ "eval_samples_per_second": 34.525,
+ "eval_steps_per_second": 4.334,
+ "step": 1155
+ },
+ {
+ "entropy": 0.543001911342144,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.39842459559440613,
+ "learning_rate": 9.548779009744178e-05,
+ "loss": 0.4850382995605469,
+ "mean_token_accuracy": 0.8531740349531174,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5417884975671768,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.529087245464325,
+ "learning_rate": 9.37725942116738e-05,
+ "loss": 0.48398651123046876,
+ "mean_token_accuracy": 0.8532969230413436,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5348580208420753,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.4483303129673004,
+ "learning_rate": 9.197749279327802e-05,
+ "loss": 0.4842509078979492,
+ "mean_token_accuracy": 0.8533288407325744,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.5422027057409287,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.448234885931015,
+ "learning_rate": 9.010617432612243e-05,
+ "loss": 0.48615737915039064,
+ "mean_token_accuracy": 0.8517620968818664,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.5518654137849808,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.5707411170005798,
+ "learning_rate": 8.816248390102322e-05,
+ "loss": 0.4946014404296875,
+ "mean_token_accuracy": 0.8510974669456481,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.5268254142999649,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.42708849906921387,
+ "learning_rate": 8.615041531504609e-05,
+ "loss": 0.474882926940918,
+ "mean_token_accuracy": 0.8555294382572174,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5430487287044525,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.5479393005371094,
+ "learning_rate": 8.407410286525337e-05,
+ "loss": 0.48806171417236327,
+ "mean_token_accuracy": 0.8523763221502304,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.6002918778417202,
+ "eval_loss": 0.6801126003265381,
+ "eval_mean_token_accuracy": 0.8130095520844827,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 47.9844,
+ "eval_samples_per_second": 34.532,
+ "eval_steps_per_second": 4.335,
+ "step": 1540
+ },
+ {
+ "entropy": 0.5307527387142181,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.5444459915161133,
+ "learning_rate": 8.193781285375899e-05,
+ "loss": 0.4741718292236328,
+ "mean_token_accuracy": 0.8564361107349395,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.47564762085676193,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.6210038065910339,
+ "learning_rate": 7.974593482154601e-05,
+ "loss": 0.41448020935058594,
+ "mean_token_accuracy": 0.8706729990243912,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4731892004609108,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.4910389482975006,
+ "learning_rate": 7.750297252905916e-05,
+ "loss": 0.4135689163208008,
+ "mean_token_accuracy": 0.8708927237987518,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.47941224902868274,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.4900703430175781,
+ "learning_rate": 7.521353470210501e-05,
+ "loss": 0.4219230270385742,
+ "mean_token_accuracy": 0.8685608941316605,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.48145264506340024,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.4503515064716339,
+ "learning_rate": 7.288232556207461e-05,
+ "loss": 0.4248290252685547,
+ "mean_token_accuracy": 0.8680109107494354,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.47785325974226,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.5630834698677063,
+ "learning_rate": 7.051413515994661e-05,
+ "loss": 0.4244534683227539,
+ "mean_token_accuracy": 0.8692984575033188,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4738149631023407,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.5809922814369202,
+ "learning_rate": 6.811382953393207e-05,
+ "loss": 0.41768589019775393,
+ "mean_token_accuracy": 0.870254020690918,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.47820780247449873,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.5658873915672302,
+ "learning_rate": 6.56863407109845e-05,
+ "loss": 0.42182437896728514,
+ "mean_token_accuracy": 0.8680490332841874,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5540635969776374,
+ "eval_loss": 0.6924836039543152,
+ "eval_mean_token_accuracy": 0.8140181821699326,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 47.9926,
+ "eval_samples_per_second": 34.526,
+ "eval_steps_per_second": 4.334,
+ "step": 1925
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.723982041528156e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.04641649878824187,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Qwen3 ChatML template. With tools: one system turn holding the optional system prompt plus the tool signatures inside <tools> tags. #}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user turn that is not just a wrapped <tool_response> payload. #}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- No reasoning_content field: split an inline <think>...</think> section out of the message body. #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- The think block is only rendered for assistant turns after the last user query. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}{#- Consecutive tool results share a single user turn. #}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. #}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..905899b993df934967865bd125d25b1fa1402b2e
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2310/trainer_state.json
@@ -0,0 +1,560 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 6.0,
+ "eval_steps": 500,
+ "global_step": 2310,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.0450534927845,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.9224470853805542,
+ "learning_rate": 1.3970570546126444e-05,
+ "loss": 1.944740753173828,
+ "mean_token_accuracy": 0.6158743992447853,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 1.1079417771101,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6827074289321899,
+ "learning_rate": 2.822625477686771e-05,
+ "loss": 1.0624147033691407,
+ "mean_token_accuracy": 0.7396554726362229,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8689985585212707,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.6320164203643799,
+ "learning_rate": 4.248193900760899e-05,
+ "loss": 0.8256858825683594,
+ "mean_token_accuracy": 0.7825630265474319,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7946180325746536,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.7246975302696228,
+ "learning_rate": 5.6737623238350247e-05,
+ "loss": 0.7475,
+ "mean_token_accuracy": 0.7959125518798829,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.7919283157587051,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.7635198831558228,
+ "learning_rate": 7.099330746909153e-05,
+ "loss": 0.737699966430664,
+ "mean_token_accuracy": 0.798919832110405,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7713775283098221,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.5751814842224121,
+ "learning_rate": 8.52489916998328e-05,
+ "loss": 0.7186790466308594,
+ "mean_token_accuracy": 0.8014196193218232,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.737273331284523,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.4956009089946747,
+ "learning_rate": 9.950467593057406e-05,
+ "loss": 0.6904002380371094,
+ "mean_token_accuracy": 0.8083628410100937,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7681069333965962,
+ "eval_loss": 0.7430074214935303,
+ "eval_mean_token_accuracy": 0.7941452006881053,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 47.9814,
+ "eval_samples_per_second": 34.534,
+ "eval_steps_per_second": 4.335,
+ "step": 385
+ },
+ {
+ "entropy": 0.7167584246397019,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.638637900352478,
+ "learning_rate": 0.00010976434715123926,
+ "loss": 0.6611510467529297,
+ "mean_token_accuracy": 0.8145374125242233,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6893188625574111,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.554688036441803,
+ "learning_rate": 0.00010967639449071182,
+ "loss": 0.6347718811035157,
+ "mean_token_accuracy": 0.8226255452632905,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6728485250473022,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.5104691386222839,
+ "learning_rate": 0.00010947585797076785,
+ "loss": 0.6233362579345703,
+ "mean_token_accuracy": 0.8227335858345032,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6812490409612656,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.4700304865837097,
+ "learning_rate": 0.00010916314964373551,
+ "loss": 0.6289351654052734,
+ "mean_token_accuracy": 0.8217820060253144,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6633523851633072,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.5094137191772461,
+ "learning_rate": 0.0001087389120469154,
+ "loss": 0.6168266296386719,
+ "mean_token_accuracy": 0.8232935756444931,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.6454597359895706,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.40546780824661255,
+ "learning_rate": 0.00010820401688232725,
+ "loss": 0.5995024108886718,
+ "mean_token_accuracy": 0.8284876370429992,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6700882267951965,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.35203394293785095,
+ "learning_rate": 0.00010755956322558065,
+ "loss": 0.616350212097168,
+ "mean_token_accuracy": 0.8246171402931214,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6587968200445176,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5248188376426697,
+ "learning_rate": 0.00010680687526754984,
+ "loss": 0.608861198425293,
+ "mean_token_accuracy": 0.8265628081560135,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.7101548368541094,
+ "eval_loss": 0.6869887113571167,
+ "eval_mean_token_accuracy": 0.8058141409777678,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 48.0161,
+ "eval_samples_per_second": 34.509,
+ "eval_steps_per_second": 4.332,
+ "step": 770
+ },
+ {
+ "entropy": 0.6272834652662277,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.5319918990135193,
+ "learning_rate": 0.00010594749959349313,
+ "loss": 0.5719264221191406,
+ "mean_token_accuracy": 0.8345229256153107,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.599158108830452,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.4296111762523651,
+ "learning_rate": 0.00010498320200520744,
+ "loss": 0.5460208129882812,
+ "mean_token_accuracy": 0.8396762716770172,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5994478952884674,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.3820176422595978,
+ "learning_rate": 0.00010391596389274791,
+ "loss": 0.5483282852172852,
+ "mean_token_accuracy": 0.8385387778282165,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5989043036103249,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.41718003153800964,
+ "learning_rate": 0.00010274797816316749,
+ "loss": 0.543673095703125,
+ "mean_token_accuracy": 0.8396250855922699,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.6005555561184883,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.4552708864212036,
+ "learning_rate": 0.00010148164473464206,
+ "loss": 0.5505282974243164,
+ "mean_token_accuracy": 0.8371219438314438,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5972397547960281,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.45796939730644226,
+ "learning_rate": 0.00010011956560523972,
+ "loss": 0.5410661697387695,
+ "mean_token_accuracy": 0.8398081564903259,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5875487339496612,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.37569308280944824,
+ "learning_rate": 9.866453950646624e-05,
+ "loss": 0.537100830078125,
+ "mean_token_accuracy": 0.8407874500751495,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.6040622130036354,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.4184563457965851,
+ "learning_rate": 9.711955615257278e-05,
+ "loss": 0.5466165924072266,
+ "mean_token_accuracy": 0.8411829793453216,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6296488522337034,
+ "eval_loss": 0.671941876411438,
+ "eval_mean_token_accuracy": 0.8121863231062889,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 47.9946,
+ "eval_samples_per_second": 34.525,
+ "eval_steps_per_second": 4.334,
+ "step": 1155
+ },
+ {
+ "entropy": 0.543001911342144,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.39842459559440613,
+ "learning_rate": 9.548779009744178e-05,
+ "loss": 0.4850382995605469,
+ "mean_token_accuracy": 0.8531740349531174,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5417884975671768,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.529087245464325,
+ "learning_rate": 9.37725942116738e-05,
+ "loss": 0.48398651123046876,
+ "mean_token_accuracy": 0.8532969230413436,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5348580208420753,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.4483303129673004,
+ "learning_rate": 9.197749279327802e-05,
+ "loss": 0.4842509078979492,
+ "mean_token_accuracy": 0.8533288407325744,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.5422027057409287,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.448234885931015,
+ "learning_rate": 9.010617432612243e-05,
+ "loss": 0.48615737915039064,
+ "mean_token_accuracy": 0.8517620968818664,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.5518654137849808,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.5707411170005798,
+ "learning_rate": 8.816248390102322e-05,
+ "loss": 0.4946014404296875,
+ "mean_token_accuracy": 0.8510974669456481,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.5268254142999649,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.42708849906921387,
+ "learning_rate": 8.615041531504609e-05,
+ "loss": 0.474882926940918,
+ "mean_token_accuracy": 0.8555294382572174,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5430487287044525,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.5479393005371094,
+ "learning_rate": 8.407410286525337e-05,
+ "loss": 0.48806171417236327,
+ "mean_token_accuracy": 0.8523763221502304,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.6002918778417202,
+ "eval_loss": 0.6801126003265381,
+ "eval_mean_token_accuracy": 0.8130095520844827,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 47.9844,
+ "eval_samples_per_second": 34.532,
+ "eval_steps_per_second": 4.335,
+ "step": 1540
+ },
+ {
+ "entropy": 0.5307527387142181,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.5444459915161133,
+ "learning_rate": 8.193781285375899e-05,
+ "loss": 0.4741718292236328,
+ "mean_token_accuracy": 0.8564361107349395,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.47564762085676193,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.6210038065910339,
+ "learning_rate": 7.974593482154601e-05,
+ "loss": 0.41448020935058594,
+ "mean_token_accuracy": 0.8706729990243912,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4731892004609108,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.4910389482975006,
+ "learning_rate": 7.750297252905916e-05,
+ "loss": 0.4135689163208008,
+ "mean_token_accuracy": 0.8708927237987518,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.47941224902868274,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.4900703430175781,
+ "learning_rate": 7.521353470210501e-05,
+ "loss": 0.4219230270385742,
+ "mean_token_accuracy": 0.8685608941316605,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.48145264506340024,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.4503515064716339,
+ "learning_rate": 7.288232556207461e-05,
+ "loss": 0.4248290252685547,
+ "mean_token_accuracy": 0.8680109107494354,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.47785325974226,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.5630834698677063,
+ "learning_rate": 7.051413515994661e-05,
+ "loss": 0.4244534683227539,
+ "mean_token_accuracy": 0.8692984575033188,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4738149631023407,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.5809922814369202,
+ "learning_rate": 6.811382953393207e-05,
+ "loss": 0.41768589019775393,
+ "mean_token_accuracy": 0.870254020690918,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.47820780247449873,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.5658873915672302,
+ "learning_rate": 6.56863407109845e-05,
+ "loss": 0.42182437896728514,
+ "mean_token_accuracy": 0.8680490332841874,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5540635969776374,
+ "eval_loss": 0.6924836039543152,
+ "eval_mean_token_accuracy": 0.8140181821699326,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 47.9926,
+ "eval_samples_per_second": 34.526,
+ "eval_steps_per_second": 4.334,
+ "step": 1925
+ },
+ {
+ "entropy": 0.4371735429763794,
+ "epoch": 5.064935064935065,
+ "grad_norm": 0.5684086680412292,
+ "learning_rate": 6.323665657271966e-05,
+ "loss": 0.3749085998535156,
+ "mean_token_accuracy": 0.8815305006504058,
+ "num_tokens": 4975955.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.40612608641386033,
+ "epoch": 5.194805194805195,
+ "grad_norm": 0.6515666842460632,
+ "learning_rate": 6.076981060656787e-05,
+ "loss": 0.33952392578125,
+ "mean_token_accuracy": 0.8902835595607758,
+ "num_tokens": 5104770.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.41485957682132724,
+ "epoch": 5.324675324675325,
+ "grad_norm": 0.5700598359107971,
+ "learning_rate": 5.829087156321799e-05,
+ "loss": 0.345616455078125,
+ "mean_token_accuracy": 0.8897144883871079,
+ "num_tokens": 5229052.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.4269444864988327,
+ "epoch": 5.454545454545454,
+ "grad_norm": 0.6670826077461243,
+ "learning_rate": 5.580493304160404e-05,
+ "loss": 0.35833843231201173,
+ "mean_token_accuracy": 0.8866234600543976,
+ "num_tokens": 5357753.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.41144982814788816,
+ "epoch": 5.584415584415584,
+ "grad_norm": 0.620884120464325,
+ "learning_rate": 5.331710302283492e-05,
+ "loss": 0.3445538330078125,
+ "mean_token_accuracy": 0.8895936322212219,
+ "num_tokens": 5481550.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.41389220267534255,
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.6487388610839844,
+ "learning_rate": 5.0832493374572605e-05,
+ "loss": 0.34858001708984376,
+ "mean_token_accuracy": 0.8874113804101944,
+ "num_tokens": 5609104.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.41483602195978164,
+ "epoch": 5.8441558441558445,
+ "grad_norm": 0.5946773290634155,
+ "learning_rate": 4.835620934742408e-05,
+ "loss": 0.3495229721069336,
+ "mean_token_accuracy": 0.8887655180692673,
+ "num_tokens": 5737347.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.4204313641786575,
+ "epoch": 5.974025974025974,
+ "grad_norm": 0.6658430099487305,
+ "learning_rate": 4.589333908492996e-05,
+ "loss": 0.3538378143310547,
+ "mean_token_accuracy": 0.8866806083917618,
+ "num_tokens": 5861872.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.4912281467650945,
+ "eval_loss": 0.7537463903427124,
+ "eval_mean_token_accuracy": 0.8104288898981534,
+ "eval_num_tokens": 5889300.0,
+ "eval_runtime": 48.0083,
+ "eval_samples_per_second": 34.515,
+ "eval_steps_per_second": 4.333,
+ "step": 2310
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.26902097163009e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.04641649878824187,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Qwen3 ChatML template. With tools: one system turn holding the optional system prompt plus the tool signatures inside <tools> tags. #}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user turn that is not just a wrapped <tool_response> payload. #}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- No reasoning_content field: split an inline <think>...</think> section out of the message body. #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- The think block is only rendered for assistant turns after the last user query. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}{#- Consecutive tool results share a single user turn. #}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. #}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ce9b160588db7f10e1f9cc8177b12783be16ecc1
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2695/trainer_state.json
@@ -0,0 +1,641 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.0,
+ "eval_steps": 500,
+ "global_step": 2695,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.0450534927845,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.9224470853805542,
+ "learning_rate": 1.3970570546126444e-05,
+ "loss": 1.944740753173828,
+ "mean_token_accuracy": 0.6158743992447853,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 1.1079417771101,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6827074289321899,
+ "learning_rate": 2.822625477686771e-05,
+ "loss": 1.0624147033691407,
+ "mean_token_accuracy": 0.7396554726362229,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8689985585212707,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.6320164203643799,
+ "learning_rate": 4.248193900760899e-05,
+ "loss": 0.8256858825683594,
+ "mean_token_accuracy": 0.7825630265474319,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7946180325746536,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.7246975302696228,
+ "learning_rate": 5.6737623238350247e-05,
+ "loss": 0.7475,
+ "mean_token_accuracy": 0.7959125518798829,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.7919283157587051,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.7635198831558228,
+ "learning_rate": 7.099330746909153e-05,
+ "loss": 0.737699966430664,
+ "mean_token_accuracy": 0.798919832110405,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7713775283098221,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.5751814842224121,
+ "learning_rate": 8.52489916998328e-05,
+ "loss": 0.7186790466308594,
+ "mean_token_accuracy": 0.8014196193218232,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.737273331284523,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.4956009089946747,
+ "learning_rate": 9.950467593057406e-05,
+ "loss": 0.6904002380371094,
+ "mean_token_accuracy": 0.8083628410100937,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7681069333965962,
+ "eval_loss": 0.7430074214935303,
+ "eval_mean_token_accuracy": 0.7941452006881053,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 47.9814,
+ "eval_samples_per_second": 34.534,
+ "eval_steps_per_second": 4.335,
+ "step": 385
+ },
+ {
+ "entropy": 0.7167584246397019,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.638637900352478,
+ "learning_rate": 0.00010976434715123926,
+ "loss": 0.6611510467529297,
+ "mean_token_accuracy": 0.8145374125242233,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6893188625574111,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.554688036441803,
+ "learning_rate": 0.00010967639449071182,
+ "loss": 0.6347718811035157,
+ "mean_token_accuracy": 0.8226255452632905,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6728485250473022,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.5104691386222839,
+ "learning_rate": 0.00010947585797076785,
+ "loss": 0.6233362579345703,
+ "mean_token_accuracy": 0.8227335858345032,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6812490409612656,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.4700304865837097,
+ "learning_rate": 0.00010916314964373551,
+ "loss": 0.6289351654052734,
+ "mean_token_accuracy": 0.8217820060253144,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6633523851633072,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.5094137191772461,
+ "learning_rate": 0.0001087389120469154,
+ "loss": 0.6168266296386719,
+ "mean_token_accuracy": 0.8232935756444931,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.6454597359895706,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.40546780824661255,
+ "learning_rate": 0.00010820401688232725,
+ "loss": 0.5995024108886718,
+ "mean_token_accuracy": 0.8284876370429992,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6700882267951965,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.35203394293785095,
+ "learning_rate": 0.00010755956322558065,
+ "loss": 0.616350212097168,
+ "mean_token_accuracy": 0.8246171402931214,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6587968200445176,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5248188376426697,
+ "learning_rate": 0.00010680687526754984,
+ "loss": 0.608861198425293,
+ "mean_token_accuracy": 0.8265628081560135,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.7101548368541094,
+ "eval_loss": 0.6869887113571167,
+ "eval_mean_token_accuracy": 0.8058141409777678,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 48.0161,
+ "eval_samples_per_second": 34.509,
+ "eval_steps_per_second": 4.332,
+ "step": 770
+ },
+ {
+ "entropy": 0.6272834652662277,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.5319918990135193,
+ "learning_rate": 0.00010594749959349313,
+ "loss": 0.5719264221191406,
+ "mean_token_accuracy": 0.8345229256153107,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.599158108830452,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.4296111762523651,
+ "learning_rate": 0.00010498320200520744,
+ "loss": 0.5460208129882812,
+ "mean_token_accuracy": 0.8396762716770172,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5994478952884674,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.3820176422595978,
+ "learning_rate": 0.00010391596389274791,
+ "loss": 0.5483282852172852,
+ "mean_token_accuracy": 0.8385387778282165,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5989043036103249,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.41718003153800964,
+ "learning_rate": 0.00010274797816316749,
+ "loss": 0.543673095703125,
+ "mean_token_accuracy": 0.8396250855922699,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.6005555561184883,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.4552708864212036,
+ "learning_rate": 0.00010148164473464206,
+ "loss": 0.5505282974243164,
+ "mean_token_accuracy": 0.8371219438314438,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5972397547960281,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.45796939730644226,
+ "learning_rate": 0.00010011956560523972,
+ "loss": 0.5410661697387695,
+ "mean_token_accuracy": 0.8398081564903259,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5875487339496612,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.37569308280944824,
+ "learning_rate": 9.866453950646624e-05,
+ "loss": 0.537100830078125,
+ "mean_token_accuracy": 0.8407874500751495,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.6040622130036354,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.4184563457965851,
+ "learning_rate": 9.711955615257278e-05,
+ "loss": 0.5466165924072266,
+ "mean_token_accuracy": 0.8411829793453216,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6296488522337034,
+ "eval_loss": 0.671941876411438,
+ "eval_mean_token_accuracy": 0.8121863231062889,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 47.9946,
+ "eval_samples_per_second": 34.525,
+ "eval_steps_per_second": 4.334,
+ "step": 1155
+ },
+ {
+ "entropy": 0.543001911342144,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.39842459559440613,
+ "learning_rate": 9.548779009744178e-05,
+ "loss": 0.4850382995605469,
+ "mean_token_accuracy": 0.8531740349531174,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5417884975671768,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.529087245464325,
+ "learning_rate": 9.37725942116738e-05,
+ "loss": 0.48398651123046876,
+ "mean_token_accuracy": 0.8532969230413436,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5348580208420753,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.4483303129673004,
+ "learning_rate": 9.197749279327802e-05,
+ "loss": 0.4842509078979492,
+ "mean_token_accuracy": 0.8533288407325744,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.5422027057409287,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.448234885931015,
+ "learning_rate": 9.010617432612243e-05,
+ "loss": 0.48615737915039064,
+ "mean_token_accuracy": 0.8517620968818664,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.5518654137849808,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.5707411170005798,
+ "learning_rate": 8.816248390102322e-05,
+ "loss": 0.4946014404296875,
+ "mean_token_accuracy": 0.8510974669456481,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.5268254142999649,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.42708849906921387,
+ "learning_rate": 8.615041531504609e-05,
+ "loss": 0.474882926940918,
+ "mean_token_accuracy": 0.8555294382572174,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5430487287044525,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.5479393005371094,
+ "learning_rate": 8.407410286525337e-05,
+ "loss": 0.48806171417236327,
+ "mean_token_accuracy": 0.8523763221502304,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.6002918778417202,
+ "eval_loss": 0.6801126003265381,
+ "eval_mean_token_accuracy": 0.8130095520844827,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 47.9844,
+ "eval_samples_per_second": 34.532,
+ "eval_steps_per_second": 4.335,
+ "step": 1540
+ },
+ {
+ "entropy": 0.5307527387142181,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.5444459915161133,
+ "learning_rate": 8.193781285375899e-05,
+ "loss": 0.4741718292236328,
+ "mean_token_accuracy": 0.8564361107349395,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.47564762085676193,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.6210038065910339,
+ "learning_rate": 7.974593482154601e-05,
+ "loss": 0.41448020935058594,
+ "mean_token_accuracy": 0.8706729990243912,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4731892004609108,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.4910389482975006,
+ "learning_rate": 7.750297252905916e-05,
+ "loss": 0.4135689163208008,
+ "mean_token_accuracy": 0.8708927237987518,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.47941224902868274,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.4900703430175781,
+ "learning_rate": 7.521353470210501e-05,
+ "loss": 0.4219230270385742,
+ "mean_token_accuracy": 0.8685608941316605,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.48145264506340024,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.4503515064716339,
+ "learning_rate": 7.288232556207461e-05,
+ "loss": 0.4248290252685547,
+ "mean_token_accuracy": 0.8680109107494354,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.47785325974226,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.5630834698677063,
+ "learning_rate": 7.051413515994661e-05,
+ "loss": 0.4244534683227539,
+ "mean_token_accuracy": 0.8692984575033188,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4738149631023407,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.5809922814369202,
+ "learning_rate": 6.811382953393207e-05,
+ "loss": 0.41768589019775393,
+ "mean_token_accuracy": 0.870254020690918,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.47820780247449873,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.5658873915672302,
+ "learning_rate": 6.56863407109845e-05,
+ "loss": 0.42182437896728514,
+ "mean_token_accuracy": 0.8680490332841874,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5540635969776374,
+ "eval_loss": 0.6924836039543152,
+ "eval_mean_token_accuracy": 0.8140181821699326,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 47.9926,
+ "eval_samples_per_second": 34.526,
+ "eval_steps_per_second": 4.334,
+ "step": 1925
+ },
+ {
+ "entropy": 0.4371735429763794,
+ "epoch": 5.064935064935065,
+ "grad_norm": 0.5684086680412292,
+ "learning_rate": 6.323665657271966e-05,
+ "loss": 0.3749085998535156,
+ "mean_token_accuracy": 0.8815305006504058,
+ "num_tokens": 4975955.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.40612608641386033,
+ "epoch": 5.194805194805195,
+ "grad_norm": 0.6515666842460632,
+ "learning_rate": 6.076981060656787e-05,
+ "loss": 0.33952392578125,
+ "mean_token_accuracy": 0.8902835595607758,
+ "num_tokens": 5104770.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.41485957682132724,
+ "epoch": 5.324675324675325,
+ "grad_norm": 0.5700598359107971,
+ "learning_rate": 5.829087156321799e-05,
+ "loss": 0.345616455078125,
+ "mean_token_accuracy": 0.8897144883871079,
+ "num_tokens": 5229052.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.4269444864988327,
+ "epoch": 5.454545454545454,
+ "grad_norm": 0.6670826077461243,
+ "learning_rate": 5.580493304160404e-05,
+ "loss": 0.35833843231201173,
+ "mean_token_accuracy": 0.8866234600543976,
+ "num_tokens": 5357753.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.41144982814788816,
+ "epoch": 5.584415584415584,
+ "grad_norm": 0.620884120464325,
+ "learning_rate": 5.331710302283492e-05,
+ "loss": 0.3445538330078125,
+ "mean_token_accuracy": 0.8895936322212219,
+ "num_tokens": 5481550.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.41389220267534255,
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.6487388610839844,
+ "learning_rate": 5.0832493374572605e-05,
+ "loss": 0.34858001708984376,
+ "mean_token_accuracy": 0.8874113804101944,
+ "num_tokens": 5609104.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.41483602195978164,
+ "epoch": 5.8441558441558445,
+ "grad_norm": 0.5946773290634155,
+ "learning_rate": 4.835620934742408e-05,
+ "loss": 0.3495229721069336,
+ "mean_token_accuracy": 0.8887655180692673,
+ "num_tokens": 5737347.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.4204313641786575,
+ "epoch": 5.974025974025974,
+ "grad_norm": 0.6658430099487305,
+ "learning_rate": 4.589333908492996e-05,
+ "loss": 0.3538378143310547,
+ "mean_token_accuracy": 0.8866806083917618,
+ "num_tokens": 5861872.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.4912281467650945,
+ "eval_loss": 0.7537463903427124,
+ "eval_mean_token_accuracy": 0.8104288898981534,
+ "eval_num_tokens": 5889300.0,
+ "eval_runtime": 48.0083,
+ "eval_samples_per_second": 34.515,
+ "eval_steps_per_second": 4.333,
+ "step": 2310
+ },
+ {
+ "entropy": 0.3666571286320686,
+ "epoch": 6.103896103896104,
+ "grad_norm": 0.6230902671813965,
+ "learning_rate": 4.344894316870371e-05,
+ "loss": 0.2813127517700195,
+ "mean_token_accuracy": 0.9077165073156357,
+ "num_tokens": 5990679.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.3391442158818245,
+ "epoch": 6.233766233766234,
+ "grad_norm": 0.629700779914856,
+ "learning_rate": 4.1028044220203685e-05,
+ "loss": 0.26457656860351564,
+ "mean_token_accuracy": 0.9139899307489395,
+ "num_tokens": 6117918.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34152675241231917,
+ "epoch": 6.363636363636363,
+ "grad_norm": 0.711188793182373,
+ "learning_rate": 3.863561658050396e-05,
+ "loss": 0.26950265884399416,
+ "mean_token_accuracy": 0.9120226174592971,
+ "num_tokens": 6248566.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.33951902255415917,
+ "epoch": 6.4935064935064934,
+ "grad_norm": 0.7751043438911438,
+ "learning_rate": 3.627657608926905e-05,
+ "loss": 0.26502132415771484,
+ "mean_token_accuracy": 0.9131791013479232,
+ "num_tokens": 6380529.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.3548544436693192,
+ "epoch": 6.623376623376624,
+ "grad_norm": 0.9152925610542297,
+ "learning_rate": 3.395576998393457e-05,
+ "loss": 0.27833885192871094,
+ "mean_token_accuracy": 0.9090453034639359,
+ "num_tokens": 6501669.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.3550854653120041,
+ "epoch": 6.753246753246753,
+ "grad_norm": 0.8495270013809204,
+ "learning_rate": 3.167796693984804e-05,
+ "loss": 0.27818309783935546,
+ "mean_token_accuracy": 0.9102006632089615,
+ "num_tokens": 6624404.0,
+ "step": 2600
+ },
+ {
+ "entropy": 0.34031844735145567,
+ "epoch": 6.883116883116883,
+ "grad_norm": 0.6764019727706909,
+ "learning_rate": 2.9447847271835456e-05,
+ "loss": 0.26656494140625,
+ "mean_token_accuracy": 0.9126953399181366,
+ "num_tokens": 6755532.0,
+ "step": 2650
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.4410387526911039,
+ "eval_loss": 0.8089802265167236,
+ "eval_mean_token_accuracy": 0.80925900173875,
+ "eval_num_tokens": 6870850.0,
+ "eval_runtime": 47.9562,
+ "eval_samples_per_second": 34.552,
+ "eval_steps_per_second": 4.337,
+ "step": 2695
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.810735902601032e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.04641649878824187,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Chat template: renders `messages` (and optional `tools`) as <|im_start|>role ... <|im_end|> turns. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}{#- No explicit reasoning_content: split inline reasoning off the message body at the closing think tag. -#}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last user query are rendered with a think block. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}{#- Accept both the nested {"function": {...}} shape and the flat shape. -#}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f469d9d84585a86efbdc7dcb7d9ed68565c3eb78
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3080/trainer_state.json
@@ -0,0 +1,732 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.0,
+ "eval_steps": 500,
+ "global_step": 3080,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.0450534927845,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.9224470853805542,
+ "learning_rate": 1.3970570546126444e-05,
+ "loss": 1.944740753173828,
+ "mean_token_accuracy": 0.6158743992447853,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 1.1079417771101,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6827074289321899,
+ "learning_rate": 2.822625477686771e-05,
+ "loss": 1.0624147033691407,
+ "mean_token_accuracy": 0.7396554726362229,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8689985585212707,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.6320164203643799,
+ "learning_rate": 4.248193900760899e-05,
+ "loss": 0.8256858825683594,
+ "mean_token_accuracy": 0.7825630265474319,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7946180325746536,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.7246975302696228,
+ "learning_rate": 5.6737623238350247e-05,
+ "loss": 0.7475,
+ "mean_token_accuracy": 0.7959125518798829,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.7919283157587051,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.7635198831558228,
+ "learning_rate": 7.099330746909153e-05,
+ "loss": 0.737699966430664,
+ "mean_token_accuracy": 0.798919832110405,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7713775283098221,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.5751814842224121,
+ "learning_rate": 8.52489916998328e-05,
+ "loss": 0.7186790466308594,
+ "mean_token_accuracy": 0.8014196193218232,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.737273331284523,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.4956009089946747,
+ "learning_rate": 9.950467593057406e-05,
+ "loss": 0.6904002380371094,
+ "mean_token_accuracy": 0.8083628410100937,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7681069333965962,
+ "eval_loss": 0.7430074214935303,
+ "eval_mean_token_accuracy": 0.7941452006881053,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 47.9814,
+ "eval_samples_per_second": 34.534,
+ "eval_steps_per_second": 4.335,
+ "step": 385
+ },
+ {
+ "entropy": 0.7167584246397019,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.638637900352478,
+ "learning_rate": 0.00010976434715123926,
+ "loss": 0.6611510467529297,
+ "mean_token_accuracy": 0.8145374125242233,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6893188625574111,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.554688036441803,
+ "learning_rate": 0.00010967639449071182,
+ "loss": 0.6347718811035157,
+ "mean_token_accuracy": 0.8226255452632905,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6728485250473022,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.5104691386222839,
+ "learning_rate": 0.00010947585797076785,
+ "loss": 0.6233362579345703,
+ "mean_token_accuracy": 0.8227335858345032,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6812490409612656,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.4700304865837097,
+ "learning_rate": 0.00010916314964373551,
+ "loss": 0.6289351654052734,
+ "mean_token_accuracy": 0.8217820060253144,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6633523851633072,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.5094137191772461,
+ "learning_rate": 0.0001087389120469154,
+ "loss": 0.6168266296386719,
+ "mean_token_accuracy": 0.8232935756444931,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.6454597359895706,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.40546780824661255,
+ "learning_rate": 0.00010820401688232725,
+ "loss": 0.5995024108886718,
+ "mean_token_accuracy": 0.8284876370429992,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6700882267951965,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.35203394293785095,
+ "learning_rate": 0.00010755956322558065,
+ "loss": 0.616350212097168,
+ "mean_token_accuracy": 0.8246171402931214,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6587968200445176,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5248188376426697,
+ "learning_rate": 0.00010680687526754984,
+ "loss": 0.608861198425293,
+ "mean_token_accuracy": 0.8265628081560135,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.7101548368541094,
+ "eval_loss": 0.6869887113571167,
+ "eval_mean_token_accuracy": 0.8058141409777678,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 48.0161,
+ "eval_samples_per_second": 34.509,
+ "eval_steps_per_second": 4.332,
+ "step": 770
+ },
+ {
+ "entropy": 0.6272834652662277,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.5319918990135193,
+ "learning_rate": 0.00010594749959349313,
+ "loss": 0.5719264221191406,
+ "mean_token_accuracy": 0.8345229256153107,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.599158108830452,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.4296111762523651,
+ "learning_rate": 0.00010498320200520744,
+ "loss": 0.5460208129882812,
+ "mean_token_accuracy": 0.8396762716770172,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5994478952884674,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.3820176422595978,
+ "learning_rate": 0.00010391596389274791,
+ "loss": 0.5483282852172852,
+ "mean_token_accuracy": 0.8385387778282165,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5989043036103249,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.41718003153800964,
+ "learning_rate": 0.00010274797816316749,
+ "loss": 0.543673095703125,
+ "mean_token_accuracy": 0.8396250855922699,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.6005555561184883,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.4552708864212036,
+ "learning_rate": 0.00010148164473464206,
+ "loss": 0.5505282974243164,
+ "mean_token_accuracy": 0.8371219438314438,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5972397547960281,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.45796939730644226,
+ "learning_rate": 0.00010011956560523972,
+ "loss": 0.5410661697387695,
+ "mean_token_accuracy": 0.8398081564903259,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5875487339496612,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.37569308280944824,
+ "learning_rate": 9.866453950646624e-05,
+ "loss": 0.537100830078125,
+ "mean_token_accuracy": 0.8407874500751495,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.6040622130036354,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.4184563457965851,
+ "learning_rate": 9.711955615257278e-05,
+ "loss": 0.5466165924072266,
+ "mean_token_accuracy": 0.8411829793453216,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6296488522337034,
+ "eval_loss": 0.671941876411438,
+ "eval_mean_token_accuracy": 0.8121863231062889,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 47.9946,
+ "eval_samples_per_second": 34.525,
+ "eval_steps_per_second": 4.334,
+ "step": 1155
+ },
+ {
+ "entropy": 0.543001911342144,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.39842459559440613,
+ "learning_rate": 9.548779009744178e-05,
+ "loss": 0.4850382995605469,
+ "mean_token_accuracy": 0.8531740349531174,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5417884975671768,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.529087245464325,
+ "learning_rate": 9.37725942116738e-05,
+ "loss": 0.48398651123046876,
+ "mean_token_accuracy": 0.8532969230413436,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5348580208420753,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.4483303129673004,
+ "learning_rate": 9.197749279327802e-05,
+ "loss": 0.4842509078979492,
+ "mean_token_accuracy": 0.8533288407325744,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.5422027057409287,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.448234885931015,
+ "learning_rate": 9.010617432612243e-05,
+ "loss": 0.48615737915039064,
+ "mean_token_accuracy": 0.8517620968818664,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.5518654137849808,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.5707411170005798,
+ "learning_rate": 8.816248390102322e-05,
+ "loss": 0.4946014404296875,
+ "mean_token_accuracy": 0.8510974669456481,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.5268254142999649,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.42708849906921387,
+ "learning_rate": 8.615041531504609e-05,
+ "loss": 0.474882926940918,
+ "mean_token_accuracy": 0.8555294382572174,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5430487287044525,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.5479393005371094,
+ "learning_rate": 8.407410286525337e-05,
+ "loss": 0.48806171417236327,
+ "mean_token_accuracy": 0.8523763221502304,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.6002918778417202,
+ "eval_loss": 0.6801126003265381,
+ "eval_mean_token_accuracy": 0.8130095520844827,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 47.9844,
+ "eval_samples_per_second": 34.532,
+ "eval_steps_per_second": 4.335,
+ "step": 1540
+ },
+ {
+ "entropy": 0.5307527387142181,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.5444459915161133,
+ "learning_rate": 8.193781285375899e-05,
+ "loss": 0.4741718292236328,
+ "mean_token_accuracy": 0.8564361107349395,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.47564762085676193,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.6210038065910339,
+ "learning_rate": 7.974593482154601e-05,
+ "loss": 0.41448020935058594,
+ "mean_token_accuracy": 0.8706729990243912,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4731892004609108,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.4910389482975006,
+ "learning_rate": 7.750297252905916e-05,
+ "loss": 0.4135689163208008,
+ "mean_token_accuracy": 0.8708927237987518,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.47941224902868274,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.4900703430175781,
+ "learning_rate": 7.521353470210501e-05,
+ "loss": 0.4219230270385742,
+ "mean_token_accuracy": 0.8685608941316605,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.48145264506340024,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.4503515064716339,
+ "learning_rate": 7.288232556207461e-05,
+ "loss": 0.4248290252685547,
+ "mean_token_accuracy": 0.8680109107494354,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.47785325974226,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.5630834698677063,
+ "learning_rate": 7.051413515994661e-05,
+ "loss": 0.4244534683227539,
+ "mean_token_accuracy": 0.8692984575033188,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4738149631023407,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.5809922814369202,
+ "learning_rate": 6.811382953393207e-05,
+ "loss": 0.41768589019775393,
+ "mean_token_accuracy": 0.870254020690918,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.47820780247449873,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.5658873915672302,
+ "learning_rate": 6.56863407109845e-05,
+ "loss": 0.42182437896728514,
+ "mean_token_accuracy": 0.8680490332841874,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5540635969776374,
+ "eval_loss": 0.6924836039543152,
+ "eval_mean_token_accuracy": 0.8140181821699326,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 47.9926,
+ "eval_samples_per_second": 34.526,
+ "eval_steps_per_second": 4.334,
+ "step": 1925
+ },
+ {
+ "entropy": 0.4371735429763794,
+ "epoch": 5.064935064935065,
+ "grad_norm": 0.5684086680412292,
+ "learning_rate": 6.323665657271966e-05,
+ "loss": 0.3749085998535156,
+ "mean_token_accuracy": 0.8815305006504058,
+ "num_tokens": 4975955.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.40612608641386033,
+ "epoch": 5.194805194805195,
+ "grad_norm": 0.6515666842460632,
+ "learning_rate": 6.076981060656787e-05,
+ "loss": 0.33952392578125,
+ "mean_token_accuracy": 0.8902835595607758,
+ "num_tokens": 5104770.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.41485957682132724,
+ "epoch": 5.324675324675325,
+ "grad_norm": 0.5700598359107971,
+ "learning_rate": 5.829087156321799e-05,
+ "loss": 0.345616455078125,
+ "mean_token_accuracy": 0.8897144883871079,
+ "num_tokens": 5229052.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.4269444864988327,
+ "epoch": 5.454545454545454,
+ "grad_norm": 0.6670826077461243,
+ "learning_rate": 5.580493304160404e-05,
+ "loss": 0.35833843231201173,
+ "mean_token_accuracy": 0.8866234600543976,
+ "num_tokens": 5357753.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.41144982814788816,
+ "epoch": 5.584415584415584,
+ "grad_norm": 0.620884120464325,
+ "learning_rate": 5.331710302283492e-05,
+ "loss": 0.3445538330078125,
+ "mean_token_accuracy": 0.8895936322212219,
+ "num_tokens": 5481550.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.41389220267534255,
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.6487388610839844,
+ "learning_rate": 5.0832493374572605e-05,
+ "loss": 0.34858001708984376,
+ "mean_token_accuracy": 0.8874113804101944,
+ "num_tokens": 5609104.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.41483602195978164,
+ "epoch": 5.8441558441558445,
+ "grad_norm": 0.5946773290634155,
+ "learning_rate": 4.835620934742408e-05,
+ "loss": 0.3495229721069336,
+ "mean_token_accuracy": 0.8887655180692673,
+ "num_tokens": 5737347.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.4204313641786575,
+ "epoch": 5.974025974025974,
+ "grad_norm": 0.6658430099487305,
+ "learning_rate": 4.589333908492996e-05,
+ "loss": 0.3538378143310547,
+ "mean_token_accuracy": 0.8866806083917618,
+ "num_tokens": 5861872.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.4912281467650945,
+ "eval_loss": 0.7537463903427124,
+ "eval_mean_token_accuracy": 0.8104288898981534,
+ "eval_num_tokens": 5889300.0,
+ "eval_runtime": 48.0083,
+ "eval_samples_per_second": 34.515,
+ "eval_steps_per_second": 4.333,
+ "step": 2310
+ },
+ {
+ "entropy": 0.3666571286320686,
+ "epoch": 6.103896103896104,
+ "grad_norm": 0.6230902671813965,
+ "learning_rate": 4.344894316870371e-05,
+ "loss": 0.2813127517700195,
+ "mean_token_accuracy": 0.9077165073156357,
+ "num_tokens": 5990679.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.3391442158818245,
+ "epoch": 6.233766233766234,
+ "grad_norm": 0.629700779914856,
+ "learning_rate": 4.1028044220203685e-05,
+ "loss": 0.26457656860351564,
+ "mean_token_accuracy": 0.9139899307489395,
+ "num_tokens": 6117918.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34152675241231917,
+ "epoch": 6.363636363636363,
+ "grad_norm": 0.711188793182373,
+ "learning_rate": 3.863561658050396e-05,
+ "loss": 0.26950265884399416,
+ "mean_token_accuracy": 0.9120226174592971,
+ "num_tokens": 6248566.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.33951902255415917,
+ "epoch": 6.4935064935064934,
+ "grad_norm": 0.7751043438911438,
+ "learning_rate": 3.627657608926905e-05,
+ "loss": 0.26502132415771484,
+ "mean_token_accuracy": 0.9131791013479232,
+ "num_tokens": 6380529.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.3548544436693192,
+ "epoch": 6.623376623376624,
+ "grad_norm": 0.9152925610542297,
+ "learning_rate": 3.395576998393457e-05,
+ "loss": 0.27833885192871094,
+ "mean_token_accuracy": 0.9090453034639359,
+ "num_tokens": 6501669.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.3550854653120041,
+ "epoch": 6.753246753246753,
+ "grad_norm": 0.8495270013809204,
+ "learning_rate": 3.167796693984804e-05,
+ "loss": 0.27818309783935546,
+ "mean_token_accuracy": 0.9102006632089615,
+ "num_tokens": 6624404.0,
+ "step": 2600
+ },
+ {
+ "entropy": 0.34031844735145567,
+ "epoch": 6.883116883116883,
+ "grad_norm": 0.6764019727706909,
+ "learning_rate": 2.9447847271835456e-05,
+ "loss": 0.26656494140625,
+ "mean_token_accuracy": 0.9126953399181366,
+ "num_tokens": 6755532.0,
+ "step": 2650
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.4410387526911039,
+ "eval_loss": 0.8089802265167236,
+ "eval_mean_token_accuracy": 0.80925900173875,
+ "eval_num_tokens": 6870850.0,
+ "eval_runtime": 47.9562,
+ "eval_samples_per_second": 34.552,
+ "eval_steps_per_second": 4.337,
+ "step": 2695
+ },
+ {
+ "entropy": 0.335344740152359,
+ "epoch": 7.012987012987013,
+ "grad_norm": 0.6364794373512268,
+ "learning_rate": 2.7269993317326242e-05,
+ "loss": 0.25932022094726564,
+ "mean_token_accuracy": 0.9156477284431458,
+ "num_tokens": 6884338.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.283698299229145,
+ "epoch": 7.142857142857143,
+ "grad_norm": 0.9388208389282227,
+ "learning_rate": 2.514888002079755e-05,
+ "loss": 0.19749004364013673,
+ "mean_token_accuracy": 0.9353394263982773,
+ "num_tokens": 7008981.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.27516734033823015,
+ "epoch": 7.2727272727272725,
+ "grad_norm": 0.706503689289093,
+ "learning_rate": 2.3088865738883814e-05,
+ "loss": 0.19110334396362305,
+ "mean_token_accuracy": 0.9379108762741089,
+ "num_tokens": 7135402.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.273246209025383,
+ "epoch": 7.402597402597403,
+ "grad_norm": 0.7857301235198975,
+ "learning_rate": 2.1094183285045552e-05,
+ "loss": 0.19094297409057617,
+ "mean_token_accuracy": 0.9369927847385406,
+ "num_tokens": 7265920.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.2801500430703163,
+ "epoch": 7.532467532467533,
+ "grad_norm": 0.7793248891830444,
+ "learning_rate": 1.9168931232197576e-05,
+ "loss": 0.19656993865966796,
+ "mean_token_accuracy": 0.9351688891649246,
+ "num_tokens": 7389633.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.27495736733078957,
+ "epoch": 7.662337662337662,
+ "grad_norm": 0.8637392520904541,
+ "learning_rate": 1.7317065491168085e-05,
+ "loss": 0.1936025810241699,
+ "mean_token_accuracy": 0.9363743001222611,
+ "num_tokens": 7518696.0,
+ "step": 2950
+ },
+ {
+ "entropy": 0.2810053497552872,
+ "epoch": 7.792207792207792,
+ "grad_norm": 0.7629940509796143,
+ "learning_rate": 1.554239118229261e-05,
+ "loss": 0.1976767921447754,
+ "mean_token_accuracy": 0.9348012053966522,
+ "num_tokens": 7643525.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.272479218095541,
+ "epoch": 7.922077922077922,
+ "grad_norm": 0.8325297832489014,
+ "learning_rate": 1.3848554816844692e-05,
+ "loss": 0.1889443016052246,
+ "mean_token_accuracy": 0.9384464406967163,
+ "num_tokens": 7773494.0,
+ "step": 3050
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.4007443663879083,
+ "eval_loss": 0.8944177031517029,
+ "eval_mean_token_accuracy": 0.8053252046497968,
+ "eval_num_tokens": 7852400.0,
+ "eval_runtime": 47.9727,
+ "eval_samples_per_second": 34.54,
+ "eval_steps_per_second": 4.336,
+ "step": 3080
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.3542904203667354e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.04641649878824187,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3513d449f09295b638b1af75e4254ea4b8f2ace5
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3465/trainer_state.json
@@ -0,0 +1,823 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.0,
+ "eval_steps": 500,
+ "global_step": 3465,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.0450534927845,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.9224470853805542,
+ "learning_rate": 1.3970570546126444e-05,
+ "loss": 1.944740753173828,
+ "mean_token_accuracy": 0.6158743992447853,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 1.1079417771101,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6827074289321899,
+ "learning_rate": 2.822625477686771e-05,
+ "loss": 1.0624147033691407,
+ "mean_token_accuracy": 0.7396554726362229,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8689985585212707,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.6320164203643799,
+ "learning_rate": 4.248193900760899e-05,
+ "loss": 0.8256858825683594,
+ "mean_token_accuracy": 0.7825630265474319,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7946180325746536,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.7246975302696228,
+ "learning_rate": 5.6737623238350247e-05,
+ "loss": 0.7475,
+ "mean_token_accuracy": 0.7959125518798829,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.7919283157587051,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.7635198831558228,
+ "learning_rate": 7.099330746909153e-05,
+ "loss": 0.737699966430664,
+ "mean_token_accuracy": 0.798919832110405,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7713775283098221,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.5751814842224121,
+ "learning_rate": 8.52489916998328e-05,
+ "loss": 0.7186790466308594,
+ "mean_token_accuracy": 0.8014196193218232,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.737273331284523,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.4956009089946747,
+ "learning_rate": 9.950467593057406e-05,
+ "loss": 0.6904002380371094,
+ "mean_token_accuracy": 0.8083628410100937,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7681069333965962,
+ "eval_loss": 0.7430074214935303,
+ "eval_mean_token_accuracy": 0.7941452006881053,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 47.9814,
+ "eval_samples_per_second": 34.534,
+ "eval_steps_per_second": 4.335,
+ "step": 385
+ },
+ {
+ "entropy": 0.7167584246397019,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.638637900352478,
+ "learning_rate": 0.00010976434715123926,
+ "loss": 0.6611510467529297,
+ "mean_token_accuracy": 0.8145374125242233,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6893188625574111,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.554688036441803,
+ "learning_rate": 0.00010967639449071182,
+ "loss": 0.6347718811035157,
+ "mean_token_accuracy": 0.8226255452632905,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6728485250473022,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.5104691386222839,
+ "learning_rate": 0.00010947585797076785,
+ "loss": 0.6233362579345703,
+ "mean_token_accuracy": 0.8227335858345032,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6812490409612656,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.4700304865837097,
+ "learning_rate": 0.00010916314964373551,
+ "loss": 0.6289351654052734,
+ "mean_token_accuracy": 0.8217820060253144,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6633523851633072,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.5094137191772461,
+ "learning_rate": 0.0001087389120469154,
+ "loss": 0.6168266296386719,
+ "mean_token_accuracy": 0.8232935756444931,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.6454597359895706,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.40546780824661255,
+ "learning_rate": 0.00010820401688232725,
+ "loss": 0.5995024108886718,
+ "mean_token_accuracy": 0.8284876370429992,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6700882267951965,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.35203394293785095,
+ "learning_rate": 0.00010755956322558065,
+ "loss": 0.616350212097168,
+ "mean_token_accuracy": 0.8246171402931214,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6587968200445176,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5248188376426697,
+ "learning_rate": 0.00010680687526754984,
+ "loss": 0.608861198425293,
+ "mean_token_accuracy": 0.8265628081560135,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.7101548368541094,
+ "eval_loss": 0.6869887113571167,
+ "eval_mean_token_accuracy": 0.8058141409777678,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 48.0161,
+ "eval_samples_per_second": 34.509,
+ "eval_steps_per_second": 4.332,
+ "step": 770
+ },
+ {
+ "entropy": 0.6272834652662277,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.5319918990135193,
+ "learning_rate": 0.00010594749959349313,
+ "loss": 0.5719264221191406,
+ "mean_token_accuracy": 0.8345229256153107,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.599158108830452,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.4296111762523651,
+ "learning_rate": 0.00010498320200520744,
+ "loss": 0.5460208129882812,
+ "mean_token_accuracy": 0.8396762716770172,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5994478952884674,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.3820176422595978,
+ "learning_rate": 0.00010391596389274791,
+ "loss": 0.5483282852172852,
+ "mean_token_accuracy": 0.8385387778282165,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5989043036103249,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.41718003153800964,
+ "learning_rate": 0.00010274797816316749,
+ "loss": 0.543673095703125,
+ "mean_token_accuracy": 0.8396250855922699,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.6005555561184883,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.4552708864212036,
+ "learning_rate": 0.00010148164473464206,
+ "loss": 0.5505282974243164,
+ "mean_token_accuracy": 0.8371219438314438,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5972397547960281,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.45796939730644226,
+ "learning_rate": 0.00010011956560523972,
+ "loss": 0.5410661697387695,
+ "mean_token_accuracy": 0.8398081564903259,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5875487339496612,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.37569308280944824,
+ "learning_rate": 9.866453950646624e-05,
+ "loss": 0.537100830078125,
+ "mean_token_accuracy": 0.8407874500751495,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.6040622130036354,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.4184563457965851,
+ "learning_rate": 9.711955615257278e-05,
+ "loss": 0.5466165924072266,
+ "mean_token_accuracy": 0.8411829793453216,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6296488522337034,
+ "eval_loss": 0.671941876411438,
+ "eval_mean_token_accuracy": 0.8121863231062889,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 47.9946,
+ "eval_samples_per_second": 34.525,
+ "eval_steps_per_second": 4.334,
+ "step": 1155
+ },
+ {
+ "entropy": 0.543001911342144,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.39842459559440613,
+ "learning_rate": 9.548779009744178e-05,
+ "loss": 0.4850382995605469,
+ "mean_token_accuracy": 0.8531740349531174,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5417884975671768,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.529087245464325,
+ "learning_rate": 9.37725942116738e-05,
+ "loss": 0.48398651123046876,
+ "mean_token_accuracy": 0.8532969230413436,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5348580208420753,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.4483303129673004,
+ "learning_rate": 9.197749279327802e-05,
+ "loss": 0.4842509078979492,
+ "mean_token_accuracy": 0.8533288407325744,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.5422027057409287,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.448234885931015,
+ "learning_rate": 9.010617432612243e-05,
+ "loss": 0.48615737915039064,
+ "mean_token_accuracy": 0.8517620968818664,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.5518654137849808,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.5707411170005798,
+ "learning_rate": 8.816248390102322e-05,
+ "loss": 0.4946014404296875,
+ "mean_token_accuracy": 0.8510974669456481,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.5268254142999649,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.42708849906921387,
+ "learning_rate": 8.615041531504609e-05,
+ "loss": 0.474882926940918,
+ "mean_token_accuracy": 0.8555294382572174,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5430487287044525,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.5479393005371094,
+ "learning_rate": 8.407410286525337e-05,
+ "loss": 0.48806171417236327,
+ "mean_token_accuracy": 0.8523763221502304,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.6002918778417202,
+ "eval_loss": 0.6801126003265381,
+ "eval_mean_token_accuracy": 0.8130095520844827,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 47.9844,
+ "eval_samples_per_second": 34.532,
+ "eval_steps_per_second": 4.335,
+ "step": 1540
+ },
+ {
+ "entropy": 0.5307527387142181,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.5444459915161133,
+ "learning_rate": 8.193781285375899e-05,
+ "loss": 0.4741718292236328,
+ "mean_token_accuracy": 0.8564361107349395,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.47564762085676193,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.6210038065910339,
+ "learning_rate": 7.974593482154601e-05,
+ "loss": 0.41448020935058594,
+ "mean_token_accuracy": 0.8706729990243912,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4731892004609108,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.4910389482975006,
+ "learning_rate": 7.750297252905916e-05,
+ "loss": 0.4135689163208008,
+ "mean_token_accuracy": 0.8708927237987518,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.47941224902868274,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.4900703430175781,
+ "learning_rate": 7.521353470210501e-05,
+ "loss": 0.4219230270385742,
+ "mean_token_accuracy": 0.8685608941316605,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.48145264506340024,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.4503515064716339,
+ "learning_rate": 7.288232556207461e-05,
+ "loss": 0.4248290252685547,
+ "mean_token_accuracy": 0.8680109107494354,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.47785325974226,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.5630834698677063,
+ "learning_rate": 7.051413515994661e-05,
+ "loss": 0.4244534683227539,
+ "mean_token_accuracy": 0.8692984575033188,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4738149631023407,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.5809922814369202,
+ "learning_rate": 6.811382953393207e-05,
+ "loss": 0.41768589019775393,
+ "mean_token_accuracy": 0.870254020690918,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.47820780247449873,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.5658873915672302,
+ "learning_rate": 6.56863407109845e-05,
+ "loss": 0.42182437896728514,
+ "mean_token_accuracy": 0.8680490332841874,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5540635969776374,
+ "eval_loss": 0.6924836039543152,
+ "eval_mean_token_accuracy": 0.8140181821699326,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 47.9926,
+ "eval_samples_per_second": 34.526,
+ "eval_steps_per_second": 4.334,
+ "step": 1925
+ },
+ {
+ "entropy": 0.4371735429763794,
+ "epoch": 5.064935064935065,
+ "grad_norm": 0.5684086680412292,
+ "learning_rate": 6.323665657271966e-05,
+ "loss": 0.3749085998535156,
+ "mean_token_accuracy": 0.8815305006504058,
+ "num_tokens": 4975955.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.40612608641386033,
+ "epoch": 5.194805194805195,
+ "grad_norm": 0.6515666842460632,
+ "learning_rate": 6.076981060656787e-05,
+ "loss": 0.33952392578125,
+ "mean_token_accuracy": 0.8902835595607758,
+ "num_tokens": 5104770.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.41485957682132724,
+ "epoch": 5.324675324675325,
+ "grad_norm": 0.5700598359107971,
+ "learning_rate": 5.829087156321799e-05,
+ "loss": 0.345616455078125,
+ "mean_token_accuracy": 0.8897144883871079,
+ "num_tokens": 5229052.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.4269444864988327,
+ "epoch": 5.454545454545454,
+ "grad_norm": 0.6670826077461243,
+ "learning_rate": 5.580493304160404e-05,
+ "loss": 0.35833843231201173,
+ "mean_token_accuracy": 0.8866234600543976,
+ "num_tokens": 5357753.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.41144982814788816,
+ "epoch": 5.584415584415584,
+ "grad_norm": 0.620884120464325,
+ "learning_rate": 5.331710302283492e-05,
+ "loss": 0.3445538330078125,
+ "mean_token_accuracy": 0.8895936322212219,
+ "num_tokens": 5481550.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.41389220267534255,
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.6487388610839844,
+ "learning_rate": 5.0832493374572605e-05,
+ "loss": 0.34858001708984376,
+ "mean_token_accuracy": 0.8874113804101944,
+ "num_tokens": 5609104.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.41483602195978164,
+ "epoch": 5.8441558441558445,
+ "grad_norm": 0.5946773290634155,
+ "learning_rate": 4.835620934742408e-05,
+ "loss": 0.3495229721069336,
+ "mean_token_accuracy": 0.8887655180692673,
+ "num_tokens": 5737347.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.4204313641786575,
+ "epoch": 5.974025974025974,
+ "grad_norm": 0.6658430099487305,
+ "learning_rate": 4.589333908492996e-05,
+ "loss": 0.3538378143310547,
+ "mean_token_accuracy": 0.8866806083917618,
+ "num_tokens": 5861872.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.4912281467650945,
+ "eval_loss": 0.7537463903427124,
+ "eval_mean_token_accuracy": 0.8104288898981534,
+ "eval_num_tokens": 5889300.0,
+ "eval_runtime": 48.0083,
+ "eval_samples_per_second": 34.515,
+ "eval_steps_per_second": 4.333,
+ "step": 2310
+ },
+ {
+ "entropy": 0.3666571286320686,
+ "epoch": 6.103896103896104,
+ "grad_norm": 0.6230902671813965,
+ "learning_rate": 4.344894316870371e-05,
+ "loss": 0.2813127517700195,
+ "mean_token_accuracy": 0.9077165073156357,
+ "num_tokens": 5990679.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.3391442158818245,
+ "epoch": 6.233766233766234,
+ "grad_norm": 0.629700779914856,
+ "learning_rate": 4.1028044220203685e-05,
+ "loss": 0.26457656860351564,
+ "mean_token_accuracy": 0.9139899307489395,
+ "num_tokens": 6117918.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34152675241231917,
+ "epoch": 6.363636363636363,
+ "grad_norm": 0.711188793182373,
+ "learning_rate": 3.863561658050396e-05,
+ "loss": 0.26950265884399416,
+ "mean_token_accuracy": 0.9120226174592971,
+ "num_tokens": 6248566.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.33951902255415917,
+ "epoch": 6.4935064935064934,
+ "grad_norm": 0.7751043438911438,
+ "learning_rate": 3.627657608926905e-05,
+ "loss": 0.26502132415771484,
+ "mean_token_accuracy": 0.9131791013479232,
+ "num_tokens": 6380529.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.3548544436693192,
+ "epoch": 6.623376623376624,
+ "grad_norm": 0.9152925610542297,
+ "learning_rate": 3.395576998393457e-05,
+ "loss": 0.27833885192871094,
+ "mean_token_accuracy": 0.9090453034639359,
+ "num_tokens": 6501669.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.3550854653120041,
+ "epoch": 6.753246753246753,
+ "grad_norm": 0.8495270013809204,
+ "learning_rate": 3.167796693984804e-05,
+ "loss": 0.27818309783935546,
+ "mean_token_accuracy": 0.9102006632089615,
+ "num_tokens": 6624404.0,
+ "step": 2600
+ },
+ {
+ "entropy": 0.34031844735145567,
+ "epoch": 6.883116883116883,
+ "grad_norm": 0.6764019727706909,
+ "learning_rate": 2.9447847271835456e-05,
+ "loss": 0.26656494140625,
+ "mean_token_accuracy": 0.9126953399181366,
+ "num_tokens": 6755532.0,
+ "step": 2650
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.4410387526911039,
+ "eval_loss": 0.8089802265167236,
+ "eval_mean_token_accuracy": 0.80925900173875,
+ "eval_num_tokens": 6870850.0,
+ "eval_runtime": 47.9562,
+ "eval_samples_per_second": 34.552,
+ "eval_steps_per_second": 4.337,
+ "step": 2695
+ },
+ {
+ "entropy": 0.335344740152359,
+ "epoch": 7.012987012987013,
+ "grad_norm": 0.6364794373512268,
+ "learning_rate": 2.7269993317326242e-05,
+ "loss": 0.25932022094726564,
+ "mean_token_accuracy": 0.9156477284431458,
+ "num_tokens": 6884338.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.283698299229145,
+ "epoch": 7.142857142857143,
+ "grad_norm": 0.9388208389282227,
+ "learning_rate": 2.514888002079755e-05,
+ "loss": 0.19749004364013673,
+ "mean_token_accuracy": 0.9353394263982773,
+ "num_tokens": 7008981.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.27516734033823015,
+ "epoch": 7.2727272727272725,
+ "grad_norm": 0.706503689289093,
+ "learning_rate": 2.3088865738883814e-05,
+ "loss": 0.19110334396362305,
+ "mean_token_accuracy": 0.9379108762741089,
+ "num_tokens": 7135402.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.273246209025383,
+ "epoch": 7.402597402597403,
+ "grad_norm": 0.7857301235198975,
+ "learning_rate": 2.1094183285045552e-05,
+ "loss": 0.19094297409057617,
+ "mean_token_accuracy": 0.9369927847385406,
+ "num_tokens": 7265920.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.2801500430703163,
+ "epoch": 7.532467532467533,
+ "grad_norm": 0.7793248891830444,
+ "learning_rate": 1.9168931232197576e-05,
+ "loss": 0.19656993865966796,
+ "mean_token_accuracy": 0.9351688891649246,
+ "num_tokens": 7389633.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.27495736733078957,
+ "epoch": 7.662337662337662,
+ "grad_norm": 0.8637392520904541,
+ "learning_rate": 1.7317065491168085e-05,
+ "loss": 0.1936025810241699,
+ "mean_token_accuracy": 0.9363743001222611,
+ "num_tokens": 7518696.0,
+ "step": 2950
+ },
+ {
+ "entropy": 0.2810053497552872,
+ "epoch": 7.792207792207792,
+ "grad_norm": 0.7629940509796143,
+ "learning_rate": 1.554239118229261e-05,
+ "loss": 0.1976767921447754,
+ "mean_token_accuracy": 0.9348012053966522,
+ "num_tokens": 7643525.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.272479218095541,
+ "epoch": 7.922077922077922,
+ "grad_norm": 0.8325297832489014,
+ "learning_rate": 1.3848554816844692e-05,
+ "loss": 0.1889443016052246,
+ "mean_token_accuracy": 0.9384464406967163,
+ "num_tokens": 7773494.0,
+ "step": 3050
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.4007443663879083,
+ "eval_loss": 0.8944177031517029,
+ "eval_mean_token_accuracy": 0.8053252046497968,
+ "eval_num_tokens": 7852400.0,
+ "eval_runtime": 47.9727,
+ "eval_samples_per_second": 34.54,
+ "eval_steps_per_second": 4.336,
+ "step": 3080
+ },
+ {
+ "entropy": 0.2529501300305128,
+ "epoch": 8.051948051948052,
+ "grad_norm": 0.7490427494049072,
+ "learning_rate": 1.2239036804368287e-05,
+ "loss": 0.1695658302307129,
+ "mean_token_accuracy": 0.9450542360544205,
+ "num_tokens": 7903348.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.2297460925579071,
+ "epoch": 8.181818181818182,
+ "grad_norm": 0.7783088088035583,
+ "learning_rate": 1.0717144301307847e-05,
+ "loss": 0.13803850173950194,
+ "mean_token_accuracy": 0.9550822985172271,
+ "num_tokens": 8028778.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.230171779692173,
+ "epoch": 8.311688311688311,
+ "grad_norm": 0.8110005259513855,
+ "learning_rate": 9.286004415629994e-06,
+ "loss": 0.1422537612915039,
+ "mean_token_accuracy": 0.9539634013175964,
+ "num_tokens": 8153699.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.22410715252161026,
+ "epoch": 8.441558441558442,
+ "grad_norm": 0.6850584745407104,
+ "learning_rate": 7.948557781399818e-06,
+ "loss": 0.13708532333374024,
+ "mean_token_accuracy": 0.9560226953029632,
+ "num_tokens": 8281408.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.23170628443360328,
+ "epoch": 8.571428571428571,
+ "grad_norm": 0.7979664206504822,
+ "learning_rate": 6.707552516514227e-06,
+ "loss": 0.14379706382751464,
+ "mean_token_accuracy": 0.9544130796194077,
+ "num_tokens": 8400657.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2197262801229954,
+ "epoch": 8.7012987012987,
+ "grad_norm": 0.6074482202529907,
+ "learning_rate": 5.565538576007922e-06,
+ "loss": 0.13595272064208985,
+ "mean_token_accuracy": 0.9567223310470581,
+ "num_tokens": 8533080.0,
+ "step": 3350
+ },
+ {
+ "entropy": 0.2189832380414009,
+ "epoch": 8.831168831168831,
+ "grad_norm": 0.7105876207351685,
+ "learning_rate": 4.5248625125343745e-06,
+ "loss": 0.13385194778442383,
+ "mean_token_accuracy": 0.9567512100934983,
+ "num_tokens": 8664922.0,
+ "step": 3400
+ },
+ {
+ "entropy": 0.2234116178750992,
+ "epoch": 8.96103896103896,
+ "grad_norm": 0.7933406829833984,
+ "learning_rate": 3.587662654787801e-06,
+ "loss": 0.13944730758666993,
+ "mean_token_accuracy": 0.9551076376438141,
+ "num_tokens": 8797714.0,
+ "step": 3450
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.36579762743069577,
+ "eval_loss": 0.9979091286659241,
+ "eval_mean_token_accuracy": 0.8024594216392591,
+ "eval_num_tokens": 8833950.0,
+ "eval_runtime": 47.9914,
+ "eval_samples_per_second": 34.527,
+ "eval_steps_per_second": 4.334,
+ "step": 3465
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.8981730000368845e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.04641649878824187,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..029a90a5d5388bca1d16b8579c3ca64441da31fb
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-385/trainer_state.json
@@ -0,0 +1,115 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 500,
+ "global_step": 385,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.0450534927845,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.9224470853805542,
+ "learning_rate": 1.3970570546126444e-05,
+ "loss": 1.944740753173828,
+ "mean_token_accuracy": 0.6158743992447853,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 1.1079417771101,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6827074289321899,
+ "learning_rate": 2.822625477686771e-05,
+ "loss": 1.0624147033691407,
+ "mean_token_accuracy": 0.7396554726362229,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8689985585212707,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.6320164203643799,
+ "learning_rate": 4.248193900760899e-05,
+ "loss": 0.8256858825683594,
+ "mean_token_accuracy": 0.7825630265474319,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7946180325746536,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.7246975302696228,
+ "learning_rate": 5.6737623238350247e-05,
+ "loss": 0.7475,
+ "mean_token_accuracy": 0.7959125518798829,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.7919283157587051,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.7635198831558228,
+ "learning_rate": 7.099330746909153e-05,
+ "loss": 0.737699966430664,
+ "mean_token_accuracy": 0.798919832110405,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7713775283098221,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.5751814842224121,
+ "learning_rate": 8.52489916998328e-05,
+ "loss": 0.7186790466308594,
+ "mean_token_accuracy": 0.8014196193218232,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.737273331284523,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.4956009089946747,
+ "learning_rate": 9.950467593057406e-05,
+ "loss": 0.6904002380371094,
+ "mean_token_accuracy": 0.8083628410100937,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7681069333965962,
+ "eval_loss": 0.7430074214935303,
+ "eval_mean_token_accuracy": 0.7941452006881053,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 47.9814,
+ "eval_samples_per_second": 34.534,
+ "eval_steps_per_second": 4.335,
+ "step": 385
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.429451631126118e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for a Qwen3-4B-Base LoRA Adapter
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.04641649878824187,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- System turn: when tools are supplied, the optional leading system message is followed by the tool signatures and the tool-call format instructions. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan the messages backwards for the last user message that is not a wrapped tool response; its index is stored in ns.last_query_index. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}{#- Render every turn in order. -#}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}{#- Prefer an explicit reasoning_content field; otherwise split the reasoning out of the content at the closing think tag. -#}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- The think block is only re-emitted for assistant turns after the last user query; earlier turns get the answer text alone. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages share one user turn: it is opened before the first and closed after the last of the run. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}{#- Open the assistant turn; when enable_thinking is explicitly false, pre-fill an empty think block. -#}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..263ec3a0d7823b7aadd413fb360c3e1765bbabc3
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3850/trainer_state.json
@@ -0,0 +1,914 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 10.0,
+ "eval_steps": 500,
+ "global_step": 3850,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.0450534927845,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.9224470853805542,
+ "learning_rate": 1.3970570546126444e-05,
+ "loss": 1.944740753173828,
+ "mean_token_accuracy": 0.6158743992447853,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 1.1079417771101,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6827074289321899,
+ "learning_rate": 2.822625477686771e-05,
+ "loss": 1.0624147033691407,
+ "mean_token_accuracy": 0.7396554726362229,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8689985585212707,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.6320164203643799,
+ "learning_rate": 4.248193900760899e-05,
+ "loss": 0.8256858825683594,
+ "mean_token_accuracy": 0.7825630265474319,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7946180325746536,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.7246975302696228,
+ "learning_rate": 5.6737623238350247e-05,
+ "loss": 0.7475,
+ "mean_token_accuracy": 0.7959125518798829,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.7919283157587051,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.7635198831558228,
+ "learning_rate": 7.099330746909153e-05,
+ "loss": 0.737699966430664,
+ "mean_token_accuracy": 0.798919832110405,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7713775283098221,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.5751814842224121,
+ "learning_rate": 8.52489916998328e-05,
+ "loss": 0.7186790466308594,
+ "mean_token_accuracy": 0.8014196193218232,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.737273331284523,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.4956009089946747,
+ "learning_rate": 9.950467593057406e-05,
+ "loss": 0.6904002380371094,
+ "mean_token_accuracy": 0.8083628410100937,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7681069333965962,
+ "eval_loss": 0.7430074214935303,
+ "eval_mean_token_accuracy": 0.7941452006881053,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 47.9814,
+ "eval_samples_per_second": 34.534,
+ "eval_steps_per_second": 4.335,
+ "step": 385
+ },
+ {
+ "entropy": 0.7167584246397019,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.638637900352478,
+ "learning_rate": 0.00010976434715123926,
+ "loss": 0.6611510467529297,
+ "mean_token_accuracy": 0.8145374125242233,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6893188625574111,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.554688036441803,
+ "learning_rate": 0.00010967639449071182,
+ "loss": 0.6347718811035157,
+ "mean_token_accuracy": 0.8226255452632905,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6728485250473022,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.5104691386222839,
+ "learning_rate": 0.00010947585797076785,
+ "loss": 0.6233362579345703,
+ "mean_token_accuracy": 0.8227335858345032,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6812490409612656,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.4700304865837097,
+ "learning_rate": 0.00010916314964373551,
+ "loss": 0.6289351654052734,
+ "mean_token_accuracy": 0.8217820060253144,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6633523851633072,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.5094137191772461,
+ "learning_rate": 0.0001087389120469154,
+ "loss": 0.6168266296386719,
+ "mean_token_accuracy": 0.8232935756444931,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.6454597359895706,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.40546780824661255,
+ "learning_rate": 0.00010820401688232725,
+ "loss": 0.5995024108886718,
+ "mean_token_accuracy": 0.8284876370429992,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6700882267951965,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.35203394293785095,
+ "learning_rate": 0.00010755956322558065,
+ "loss": 0.616350212097168,
+ "mean_token_accuracy": 0.8246171402931214,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6587968200445176,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5248188376426697,
+ "learning_rate": 0.00010680687526754984,
+ "loss": 0.608861198425293,
+ "mean_token_accuracy": 0.8265628081560135,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.7101548368541094,
+ "eval_loss": 0.6869887113571167,
+ "eval_mean_token_accuracy": 0.8058141409777678,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 48.0161,
+ "eval_samples_per_second": 34.509,
+ "eval_steps_per_second": 4.332,
+ "step": 770
+ },
+ {
+ "entropy": 0.6272834652662277,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.5319918990135193,
+ "learning_rate": 0.00010594749959349313,
+ "loss": 0.5719264221191406,
+ "mean_token_accuracy": 0.8345229256153107,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.599158108830452,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.4296111762523651,
+ "learning_rate": 0.00010498320200520744,
+ "loss": 0.5460208129882812,
+ "mean_token_accuracy": 0.8396762716770172,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5994478952884674,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.3820176422595978,
+ "learning_rate": 0.00010391596389274791,
+ "loss": 0.5483282852172852,
+ "mean_token_accuracy": 0.8385387778282165,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5989043036103249,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.41718003153800964,
+ "learning_rate": 0.00010274797816316749,
+ "loss": 0.543673095703125,
+ "mean_token_accuracy": 0.8396250855922699,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.6005555561184883,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.4552708864212036,
+ "learning_rate": 0.00010148164473464206,
+ "loss": 0.5505282974243164,
+ "mean_token_accuracy": 0.8371219438314438,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5972397547960281,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.45796939730644226,
+ "learning_rate": 0.00010011956560523972,
+ "loss": 0.5410661697387695,
+ "mean_token_accuracy": 0.8398081564903259,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5875487339496612,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.37569308280944824,
+ "learning_rate": 9.866453950646624e-05,
+ "loss": 0.537100830078125,
+ "mean_token_accuracy": 0.8407874500751495,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.6040622130036354,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.4184563457965851,
+ "learning_rate": 9.711955615257278e-05,
+ "loss": 0.5466165924072266,
+ "mean_token_accuracy": 0.8411829793453216,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6296488522337034,
+ "eval_loss": 0.671941876411438,
+ "eval_mean_token_accuracy": 0.8121863231062889,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 47.9946,
+ "eval_samples_per_second": 34.525,
+ "eval_steps_per_second": 4.334,
+ "step": 1155
+ },
+ {
+ "entropy": 0.543001911342144,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.39842459559440613,
+ "learning_rate": 9.548779009744178e-05,
+ "loss": 0.4850382995605469,
+ "mean_token_accuracy": 0.8531740349531174,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5417884975671768,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.529087245464325,
+ "learning_rate": 9.37725942116738e-05,
+ "loss": 0.48398651123046876,
+ "mean_token_accuracy": 0.8532969230413436,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5348580208420753,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.4483303129673004,
+ "learning_rate": 9.197749279327802e-05,
+ "loss": 0.4842509078979492,
+ "mean_token_accuracy": 0.8533288407325744,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.5422027057409287,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.448234885931015,
+ "learning_rate": 9.010617432612243e-05,
+ "loss": 0.48615737915039064,
+ "mean_token_accuracy": 0.8517620968818664,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.5518654137849808,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.5707411170005798,
+ "learning_rate": 8.816248390102322e-05,
+ "loss": 0.4946014404296875,
+ "mean_token_accuracy": 0.8510974669456481,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.5268254142999649,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.42708849906921387,
+ "learning_rate": 8.615041531504609e-05,
+ "loss": 0.474882926940918,
+ "mean_token_accuracy": 0.8555294382572174,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5430487287044525,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.5479393005371094,
+ "learning_rate": 8.407410286525337e-05,
+ "loss": 0.48806171417236327,
+ "mean_token_accuracy": 0.8523763221502304,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.6002918778417202,
+ "eval_loss": 0.6801126003265381,
+ "eval_mean_token_accuracy": 0.8130095520844827,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 47.9844,
+ "eval_samples_per_second": 34.532,
+ "eval_steps_per_second": 4.335,
+ "step": 1540
+ },
+ {
+ "entropy": 0.5307527387142181,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.5444459915161133,
+ "learning_rate": 8.193781285375899e-05,
+ "loss": 0.4741718292236328,
+ "mean_token_accuracy": 0.8564361107349395,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.47564762085676193,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.6210038065910339,
+ "learning_rate": 7.974593482154601e-05,
+ "loss": 0.41448020935058594,
+ "mean_token_accuracy": 0.8706729990243912,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4731892004609108,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.4910389482975006,
+ "learning_rate": 7.750297252905916e-05,
+ "loss": 0.4135689163208008,
+ "mean_token_accuracy": 0.8708927237987518,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.47941224902868274,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.4900703430175781,
+ "learning_rate": 7.521353470210501e-05,
+ "loss": 0.4219230270385742,
+ "mean_token_accuracy": 0.8685608941316605,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.48145264506340024,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.4503515064716339,
+ "learning_rate": 7.288232556207461e-05,
+ "loss": 0.4248290252685547,
+ "mean_token_accuracy": 0.8680109107494354,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.47785325974226,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.5630834698677063,
+ "learning_rate": 7.051413515994661e-05,
+ "loss": 0.4244534683227539,
+ "mean_token_accuracy": 0.8692984575033188,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4738149631023407,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.5809922814369202,
+ "learning_rate": 6.811382953393207e-05,
+ "loss": 0.41768589019775393,
+ "mean_token_accuracy": 0.870254020690918,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.47820780247449873,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.5658873915672302,
+ "learning_rate": 6.56863407109845e-05,
+ "loss": 0.42182437896728514,
+ "mean_token_accuracy": 0.8680490332841874,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5540635969776374,
+ "eval_loss": 0.6924836039543152,
+ "eval_mean_token_accuracy": 0.8140181821699326,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 47.9926,
+ "eval_samples_per_second": 34.526,
+ "eval_steps_per_second": 4.334,
+ "step": 1925
+ },
+ {
+ "entropy": 0.4371735429763794,
+ "epoch": 5.064935064935065,
+ "grad_norm": 0.5684086680412292,
+ "learning_rate": 6.323665657271966e-05,
+ "loss": 0.3749085998535156,
+ "mean_token_accuracy": 0.8815305006504058,
+ "num_tokens": 4975955.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.40612608641386033,
+ "epoch": 5.194805194805195,
+ "grad_norm": 0.6515666842460632,
+ "learning_rate": 6.076981060656787e-05,
+ "loss": 0.33952392578125,
+ "mean_token_accuracy": 0.8902835595607758,
+ "num_tokens": 5104770.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.41485957682132724,
+ "epoch": 5.324675324675325,
+ "grad_norm": 0.5700598359107971,
+ "learning_rate": 5.829087156321799e-05,
+ "loss": 0.345616455078125,
+ "mean_token_accuracy": 0.8897144883871079,
+ "num_tokens": 5229052.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.4269444864988327,
+ "epoch": 5.454545454545454,
+ "grad_norm": 0.6670826077461243,
+ "learning_rate": 5.580493304160404e-05,
+ "loss": 0.35833843231201173,
+ "mean_token_accuracy": 0.8866234600543976,
+ "num_tokens": 5357753.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.41144982814788816,
+ "epoch": 5.584415584415584,
+ "grad_norm": 0.620884120464325,
+ "learning_rate": 5.331710302283492e-05,
+ "loss": 0.3445538330078125,
+ "mean_token_accuracy": 0.8895936322212219,
+ "num_tokens": 5481550.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.41389220267534255,
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.6487388610839844,
+ "learning_rate": 5.0832493374572605e-05,
+ "loss": 0.34858001708984376,
+ "mean_token_accuracy": 0.8874113804101944,
+ "num_tokens": 5609104.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.41483602195978164,
+ "epoch": 5.8441558441558445,
+ "grad_norm": 0.5946773290634155,
+ "learning_rate": 4.835620934742408e-05,
+ "loss": 0.3495229721069336,
+ "mean_token_accuracy": 0.8887655180692673,
+ "num_tokens": 5737347.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.4204313641786575,
+ "epoch": 5.974025974025974,
+ "grad_norm": 0.6658430099487305,
+ "learning_rate": 4.589333908492996e-05,
+ "loss": 0.3538378143310547,
+ "mean_token_accuracy": 0.8866806083917618,
+ "num_tokens": 5861872.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.4912281467650945,
+ "eval_loss": 0.7537463903427124,
+ "eval_mean_token_accuracy": 0.8104288898981534,
+ "eval_num_tokens": 5889300.0,
+ "eval_runtime": 48.0083,
+ "eval_samples_per_second": 34.515,
+ "eval_steps_per_second": 4.333,
+ "step": 2310
+ },
+ {
+ "entropy": 0.3666571286320686,
+ "epoch": 6.103896103896104,
+ "grad_norm": 0.6230902671813965,
+ "learning_rate": 4.344894316870371e-05,
+ "loss": 0.2813127517700195,
+ "mean_token_accuracy": 0.9077165073156357,
+ "num_tokens": 5990679.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.3391442158818245,
+ "epoch": 6.233766233766234,
+ "grad_norm": 0.629700779914856,
+ "learning_rate": 4.1028044220203685e-05,
+ "loss": 0.26457656860351564,
+ "mean_token_accuracy": 0.9139899307489395,
+ "num_tokens": 6117918.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34152675241231917,
+ "epoch": 6.363636363636363,
+ "grad_norm": 0.711188793182373,
+ "learning_rate": 3.863561658050396e-05,
+ "loss": 0.26950265884399416,
+ "mean_token_accuracy": 0.9120226174592971,
+ "num_tokens": 6248566.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.33951902255415917,
+ "epoch": 6.4935064935064934,
+ "grad_norm": 0.7751043438911438,
+ "learning_rate": 3.627657608926905e-05,
+ "loss": 0.26502132415771484,
+ "mean_token_accuracy": 0.9131791013479232,
+ "num_tokens": 6380529.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.3548544436693192,
+ "epoch": 6.623376623376624,
+ "grad_norm": 0.9152925610542297,
+ "learning_rate": 3.395576998393457e-05,
+ "loss": 0.27833885192871094,
+ "mean_token_accuracy": 0.9090453034639359,
+ "num_tokens": 6501669.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.3550854653120041,
+ "epoch": 6.753246753246753,
+ "grad_norm": 0.8495270013809204,
+ "learning_rate": 3.167796693984804e-05,
+ "loss": 0.27818309783935546,
+ "mean_token_accuracy": 0.9102006632089615,
+ "num_tokens": 6624404.0,
+ "step": 2600
+ },
+ {
+ "entropy": 0.34031844735145567,
+ "epoch": 6.883116883116883,
+ "grad_norm": 0.6764019727706909,
+ "learning_rate": 2.9447847271835456e-05,
+ "loss": 0.26656494140625,
+ "mean_token_accuracy": 0.9126953399181366,
+ "num_tokens": 6755532.0,
+ "step": 2650
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.4410387526911039,
+ "eval_loss": 0.8089802265167236,
+ "eval_mean_token_accuracy": 0.80925900173875,
+ "eval_num_tokens": 6870850.0,
+ "eval_runtime": 47.9562,
+ "eval_samples_per_second": 34.552,
+ "eval_steps_per_second": 4.337,
+ "step": 2695
+ },
+ {
+ "entropy": 0.335344740152359,
+ "epoch": 7.012987012987013,
+ "grad_norm": 0.6364794373512268,
+ "learning_rate": 2.7269993317326242e-05,
+ "loss": 0.25932022094726564,
+ "mean_token_accuracy": 0.9156477284431458,
+ "num_tokens": 6884338.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.283698299229145,
+ "epoch": 7.142857142857143,
+ "grad_norm": 0.9388208389282227,
+ "learning_rate": 2.514888002079755e-05,
+ "loss": 0.19749004364013673,
+ "mean_token_accuracy": 0.9353394263982773,
+ "num_tokens": 7008981.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.27516734033823015,
+ "epoch": 7.2727272727272725,
+ "grad_norm": 0.706503689289093,
+ "learning_rate": 2.3088865738883814e-05,
+ "loss": 0.19110334396362305,
+ "mean_token_accuracy": 0.9379108762741089,
+ "num_tokens": 7135402.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.273246209025383,
+ "epoch": 7.402597402597403,
+ "grad_norm": 0.7857301235198975,
+ "learning_rate": 2.1094183285045552e-05,
+ "loss": 0.19094297409057617,
+ "mean_token_accuracy": 0.9369927847385406,
+ "num_tokens": 7265920.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.2801500430703163,
+ "epoch": 7.532467532467533,
+ "grad_norm": 0.7793248891830444,
+ "learning_rate": 1.9168931232197576e-05,
+ "loss": 0.19656993865966796,
+ "mean_token_accuracy": 0.9351688891649246,
+ "num_tokens": 7389633.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.27495736733078957,
+ "epoch": 7.662337662337662,
+ "grad_norm": 0.8637392520904541,
+ "learning_rate": 1.7317065491168085e-05,
+ "loss": 0.1936025810241699,
+ "mean_token_accuracy": 0.9363743001222611,
+ "num_tokens": 7518696.0,
+ "step": 2950
+ },
+ {
+ "entropy": 0.2810053497552872,
+ "epoch": 7.792207792207792,
+ "grad_norm": 0.7629940509796143,
+ "learning_rate": 1.554239118229261e-05,
+ "loss": 0.1976767921447754,
+ "mean_token_accuracy": 0.9348012053966522,
+ "num_tokens": 7643525.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.272479218095541,
+ "epoch": 7.922077922077922,
+ "grad_norm": 0.8325297832489014,
+ "learning_rate": 1.3848554816844692e-05,
+ "loss": 0.1889443016052246,
+ "mean_token_accuracy": 0.9384464406967163,
+ "num_tokens": 7773494.0,
+ "step": 3050
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.4007443663879083,
+ "eval_loss": 0.8944177031517029,
+ "eval_mean_token_accuracy": 0.8053252046497968,
+ "eval_num_tokens": 7852400.0,
+ "eval_runtime": 47.9727,
+ "eval_samples_per_second": 34.54,
+ "eval_steps_per_second": 4.336,
+ "step": 3080
+ },
+ {
+ "entropy": 0.2529501300305128,
+ "epoch": 8.051948051948052,
+ "grad_norm": 0.7490427494049072,
+ "learning_rate": 1.2239036804368287e-05,
+ "loss": 0.1695658302307129,
+ "mean_token_accuracy": 0.9450542360544205,
+ "num_tokens": 7903348.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.2297460925579071,
+ "epoch": 8.181818181818182,
+ "grad_norm": 0.7783088088035583,
+ "learning_rate": 1.0717144301307847e-05,
+ "loss": 0.13803850173950194,
+ "mean_token_accuracy": 0.9550822985172271,
+ "num_tokens": 8028778.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.230171779692173,
+ "epoch": 8.311688311688311,
+ "grad_norm": 0.8110005259513855,
+ "learning_rate": 9.286004415629994e-06,
+ "loss": 0.1422537612915039,
+ "mean_token_accuracy": 0.9539634013175964,
+ "num_tokens": 8153699.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.22410715252161026,
+ "epoch": 8.441558441558442,
+ "grad_norm": 0.6850584745407104,
+ "learning_rate": 7.948557781399818e-06,
+ "loss": 0.13708532333374024,
+ "mean_token_accuracy": 0.9560226953029632,
+ "num_tokens": 8281408.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.23170628443360328,
+ "epoch": 8.571428571428571,
+ "grad_norm": 0.7979664206504822,
+ "learning_rate": 6.707552516514227e-06,
+ "loss": 0.14379706382751464,
+ "mean_token_accuracy": 0.9544130796194077,
+ "num_tokens": 8400657.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2197262801229954,
+ "epoch": 8.7012987012987,
+ "grad_norm": 0.6074482202529907,
+ "learning_rate": 5.565538576007922e-06,
+ "loss": 0.13595272064208985,
+ "mean_token_accuracy": 0.9567223310470581,
+ "num_tokens": 8533080.0,
+ "step": 3350
+ },
+ {
+ "entropy": 0.2189832380414009,
+ "epoch": 8.831168831168831,
+ "grad_norm": 0.7105876207351685,
+ "learning_rate": 4.5248625125343745e-06,
+ "loss": 0.13385194778442383,
+ "mean_token_accuracy": 0.9567512100934983,
+ "num_tokens": 8664922.0,
+ "step": 3400
+ },
+ {
+ "entropy": 0.2234116178750992,
+ "epoch": 8.96103896103896,
+ "grad_norm": 0.7933406829833984,
+ "learning_rate": 3.587662654787801e-06,
+ "loss": 0.13944730758666993,
+ "mean_token_accuracy": 0.9551076376438141,
+ "num_tokens": 8797714.0,
+ "step": 3450
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.36579762743069577,
+ "eval_loss": 0.9979091286659241,
+ "eval_mean_token_accuracy": 0.8024594216392591,
+ "eval_num_tokens": 8833950.0,
+ "eval_runtime": 47.9914,
+ "eval_samples_per_second": 34.527,
+ "eval_steps_per_second": 4.334,
+ "step": 3465
+ },
+ {
+ "entropy": 0.21086626052856444,
+ "epoch": 9.090909090909092,
+ "grad_norm": 0.7791101336479187,
+ "learning_rate": 2.7558647137731255e-06,
+ "loss": 0.122357816696167,
+ "mean_token_accuracy": 0.962859439253807,
+ "num_tokens": 8925069.0,
+ "step": 3500
+ },
+ {
+ "entropy": 0.19901559188961981,
+ "epoch": 9.220779220779221,
+ "grad_norm": 0.6296378970146179,
+ "learning_rate": 2.0311778259521985e-06,
+ "loss": 0.11158108711242676,
+ "mean_token_accuracy": 0.9652415263652802,
+ "num_tokens": 9056953.0,
+ "step": 3550
+ },
+ {
+ "entropy": 0.2004897651076317,
+ "epoch": 9.35064935064935,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 1.4150910413963161e-06,
+ "loss": 0.1120915412902832,
+ "mean_token_accuracy": 0.9652277189493179,
+ "num_tokens": 9186990.0,
+ "step": 3600
+ },
+ {
+ "entropy": 0.20551898300647736,
+ "epoch": 9.480519480519481,
+ "grad_norm": 0.6944624781608582,
+ "learning_rate": 9.088702641613061e-07,
+ "loss": 0.11612151145935058,
+ "mean_token_accuracy": 0.9643786966800689,
+ "num_tokens": 9311897.0,
+ "step": 3650
+ },
+ {
+ "entropy": 0.20479844331741334,
+ "epoch": 9.61038961038961,
+ "grad_norm": 0.7582993507385254,
+ "learning_rate": 5.135556511716324e-07,
+ "loss": 0.11483741760253906,
+ "mean_token_accuracy": 0.9642562127113342,
+ "num_tokens": 9434396.0,
+ "step": 3700
+ },
+ {
+ "entropy": 0.20504073575139045,
+ "epoch": 9.74025974025974,
+ "grad_norm": 0.7992972135543823,
+ "learning_rate": 2.299594749584497e-07,
+ "loss": 0.11569642066955567,
+ "mean_token_accuracy": 0.9646343672275544,
+ "num_tokens": 9560766.0,
+ "step": 3750
+ },
+ {
+ "entropy": 0.20401015728712082,
+ "epoch": 9.87012987012987,
+ "grad_norm": 0.8295965790748596,
+ "learning_rate": 5.866445464296065e-08,
+ "loss": 0.11431631088256836,
+ "mean_token_accuracy": 0.9642905777692795,
+ "num_tokens": 9686573.0,
+ "step": 3800
+ },
+ {
+ "entropy": 0.2005618315190077,
+ "epoch": 10.0,
+ "grad_norm": 0.9638449549674988,
+ "learning_rate": 2.255859454874737e-11,
+ "loss": 0.11225462913513183,
+ "mean_token_accuracy": 0.9657586789131165,
+ "num_tokens": 9815500.0,
+ "step": 3850
+ },
+ {
+ "epoch": 10.0,
+ "eval_entropy": 0.3506911682824676,
+ "eval_loss": 1.0614067316055298,
+ "eval_mean_token_accuracy": 0.8005969426952876,
+ "eval_num_tokens": 9815500.0,
+ "eval_runtime": 47.9746,
+ "eval_samples_per_second": 34.539,
+ "eval_steps_per_second": 4.336,
+ "step": 3850
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.436930629170852e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b529d53848971eda3155b95c185289489ac0b8
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.04641649878824187,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tool mode: one system turn holding the optional system prompt plus the tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- Scan backwards for the last user turn that is not a wrapped tool response. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Split inline reasoning from the visible answer. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is only re-emitted for assistant turns after the last user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool results are grouped into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9a50918f7a56462b63b56fd132799a700284faa1
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-770/trainer_state.json
@@ -0,0 +1,206 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 770,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.0450534927845,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.9224470853805542,
+ "learning_rate": 1.3970570546126444e-05,
+ "loss": 1.944740753173828,
+ "mean_token_accuracy": 0.6158743992447853,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 1.1079417771101,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6827074289321899,
+ "learning_rate": 2.822625477686771e-05,
+ "loss": 1.0624147033691407,
+ "mean_token_accuracy": 0.7396554726362229,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8689985585212707,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.6320164203643799,
+ "learning_rate": 4.248193900760899e-05,
+ "loss": 0.8256858825683594,
+ "mean_token_accuracy": 0.7825630265474319,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7946180325746536,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.7246975302696228,
+ "learning_rate": 5.6737623238350247e-05,
+ "loss": 0.7475,
+ "mean_token_accuracy": 0.7959125518798829,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.7919283157587051,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.7635198831558228,
+ "learning_rate": 7.099330746909153e-05,
+ "loss": 0.737699966430664,
+ "mean_token_accuracy": 0.798919832110405,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7713775283098221,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.5751814842224121,
+ "learning_rate": 8.52489916998328e-05,
+ "loss": 0.7186790466308594,
+ "mean_token_accuracy": 0.8014196193218232,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.737273331284523,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.4956009089946747,
+ "learning_rate": 9.950467593057406e-05,
+ "loss": 0.6904002380371094,
+ "mean_token_accuracy": 0.8083628410100937,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7681069333965962,
+ "eval_loss": 0.7430074214935303,
+ "eval_mean_token_accuracy": 0.7941452006881053,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 47.9814,
+ "eval_samples_per_second": 34.534,
+ "eval_steps_per_second": 4.335,
+ "step": 385
+ },
+ {
+ "entropy": 0.7167584246397019,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.638637900352478,
+ "learning_rate": 0.00010976434715123926,
+ "loss": 0.6611510467529297,
+ "mean_token_accuracy": 0.8145374125242233,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6893188625574111,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.554688036441803,
+ "learning_rate": 0.00010967639449071182,
+ "loss": 0.6347718811035157,
+ "mean_token_accuracy": 0.8226255452632905,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6728485250473022,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.5104691386222839,
+ "learning_rate": 0.00010947585797076785,
+ "loss": 0.6233362579345703,
+ "mean_token_accuracy": 0.8227335858345032,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6812490409612656,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.4700304865837097,
+ "learning_rate": 0.00010916314964373551,
+ "loss": 0.6289351654052734,
+ "mean_token_accuracy": 0.8217820060253144,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6633523851633072,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.5094137191772461,
+ "learning_rate": 0.0001087389120469154,
+ "loss": 0.6168266296386719,
+ "mean_token_accuracy": 0.8232935756444931,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.6454597359895706,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.40546780824661255,
+ "learning_rate": 0.00010820401688232725,
+ "loss": 0.5995024108886718,
+ "mean_token_accuracy": 0.8284876370429992,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6700882267951965,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.35203394293785095,
+ "learning_rate": 0.00010755956322558065,
+ "loss": 0.616350212097168,
+ "mean_token_accuracy": 0.8246171402931214,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6587968200445176,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5248188376426697,
+ "learning_rate": 0.00010680687526754984,
+ "loss": 0.608861198425293,
+ "mean_token_accuracy": 0.8265628081560135,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.7101548368541094,
+ "eval_loss": 0.6869887113571167,
+ "eval_mean_token_accuracy": 0.8058141409777678,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 48.0161,
+ "eval_samples_per_second": 34.509,
+ "eval_steps_per_second": 4.332,
+ "step": 770
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.0934953976464589e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1e1c4b96e90acaf89742a04a298690de735bcc46
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md
@@ -0,0 +1,58 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: transformers
+model_name: Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2
+tags:
+- generated_from_trainer
+- trl
+- sft
+licence: license
+---
+
+# Model Card for Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2
+
+This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="None", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[
](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/6okpbvfx)
+
+
+
+This model was trained with SFT.
+
+### Framework versions
+
+- TRL: 0.29.0
+- Transformers: 5.5.4
+- Pytorch: 2.10.0
+- Datasets: 4.6.1
+- Tokenizers: 0.22.2
+
+## Citations
+
+
+
+Cite TRL as:
+
+```bibtex
+@software{vonwerra2020trl,
+ title = {{TRL: Transformers Reinforcement Learning}},
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+ license = {Apache-2.0},
+ url = {https://github.com/huggingface/trl},
+ year = {2020}
+}
+```
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.05284766149829996,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..01b67b69b00a195e00981eca9a4433d8d03e122d
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json
@@ -0,0 +1,297 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 1155,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8529596650600433,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.878183901309967,
+ "learning_rate": 2.8227544379109735e-05,
+ "loss": 1.762685546875,
+ "mean_token_accuracy": 0.6396001759171486,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.9309259909391403,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6279756426811218,
+ "learning_rate": 5.7031161092487016e-05,
+ "loss": 0.8922532653808594,
+ "mean_token_accuracy": 0.768746777176857,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8268384379148483,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.5478948354721069,
+ "learning_rate": 8.583477780586432e-05,
+ "loss": 0.7756452178955078,
+ "mean_token_accuracy": 0.7925205600261688,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7658095824718475,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.5972853899002075,
+ "learning_rate": 0.00011463839451924158,
+ "loss": 0.7161135101318359,
+ "mean_token_accuracy": 0.8019153982400894,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.769344270825386,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.5576531887054443,
+ "learning_rate": 0.0001434420112326189,
+ "loss": 0.716611557006836,
+ "mean_token_accuracy": 0.8028716742992401,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7470700722932816,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.3937978148460388,
+ "learning_rate": 0.0001722456279459962,
+ "loss": 0.6983388519287109,
+ "mean_token_accuracy": 0.80596988260746,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.7172233510017395,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.32564300298690796,
+ "learning_rate": 0.00020104924465937345,
+ "loss": 0.6731016540527344,
+ "mean_token_accuracy": 0.8119878542423248,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7397788639825124,
+ "eval_loss": 0.7304292917251587,
+ "eval_mean_token_accuracy": 0.7976073359067624,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 46.5688,
+ "eval_samples_per_second": 35.582,
+ "eval_steps_per_second": 4.467,
+ "step": 385
+ },
+ {
+ "entropy": 0.7035581320524216,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.45747122168540955,
+ "learning_rate": 0.00022177891520076015,
+ "loss": 0.6450813293457032,
+ "mean_token_accuracy": 0.8178210550546646,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6679977881908417,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.4456019401550293,
+ "learning_rate": 0.0002216012068086725,
+ "loss": 0.6189227294921875,
+ "mean_token_accuracy": 0.8257622331380844,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6574469250440598,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.3784359097480774,
+ "learning_rate": 0.00022119602267552194,
+ "loss": 0.6098381805419922,
+ "mean_token_accuracy": 0.8251238936185836,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6593573099374771,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.36508747935295105,
+ "learning_rate": 0.00022056419535323196,
+ "loss": 0.6137563705444335,
+ "mean_token_accuracy": 0.824503253698349,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6459340593218803,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.39773380756378174,
+ "learning_rate": 0.00021970702308872148,
+ "loss": 0.5986767196655274,
+ "mean_token_accuracy": 0.8270765203237533,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.633097175359726,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.41731834411621094,
+ "learning_rate": 0.00021862626715633265,
+ "loss": 0.5849922180175782,
+ "mean_token_accuracy": 0.8313153618574143,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6569280475378036,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.3067665100097656,
+ "learning_rate": 0.00021732414823885307,
+ "loss": 0.605062370300293,
+ "mean_token_accuracy": 0.8269566106796264,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6459099870920181,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5508913397789001,
+ "learning_rate": 0.00021580334186456886,
+ "loss": 0.596672134399414,
+ "mean_token_accuracy": 0.8287447422742844,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.6985598478752834,
+ "eval_loss": 0.6716415286064148,
+ "eval_mean_token_accuracy": 0.808078097895934,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 46.5197,
+ "eval_samples_per_second": 35.619,
+ "eval_steps_per_second": 4.471,
+ "step": 770
+ },
+ {
+ "entropy": 0.6113292586803436,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.37614893913269043,
+ "learning_rate": 0.00021406697290972404,
+ "loss": 0.5562425231933594,
+ "mean_token_accuracy": 0.8369060623645782,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.5720089420676231,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.35560500621795654,
+ "learning_rate": 0.00021211860917768236,
+ "loss": 0.521314697265625,
+ "mean_token_accuracy": 0.8453685063123703,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5757493850588798,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.304592490196228,
+ "learning_rate": 0.00020996225406798486,
+ "loss": 0.5258681106567383,
+ "mean_token_accuracy": 0.8431038129329681,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5790414094924927,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.29762616753578186,
+ "learning_rate": 0.00020760233835036664,
+ "loss": 0.5219763565063477,
+ "mean_token_accuracy": 0.8438457292318344,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.5783660891652107,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.3418065011501312,
+ "learning_rate": 0.00020504371106063417,
+ "loss": 0.5258687210083007,
+ "mean_token_accuracy": 0.8422278153896332,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5746926316618919,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.3708217740058899,
+ "learning_rate": 0.00020229162953711157,
+ "loss": 0.5161260223388672,
+ "mean_token_accuracy": 0.8453844922780991,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5676264691352845,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.26185691356658936,
+ "learning_rate": 0.00019935174861812654,
+ "loss": 0.5179851913452148,
+ "mean_token_accuracy": 0.8455151951313019,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5804974803328514,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.3131248652935028,
+ "learning_rate": 0.00019623010902273397,
+ "loss": 0.5243957138061524,
+ "mean_token_accuracy": 0.8455932134389877,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6151215266436338,
+ "eval_loss": 0.6557295918464661,
+ "eval_mean_token_accuracy": 0.8159411993737404,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 46.5152,
+ "eval_samples_per_second": 35.623,
+ "eval_steps_per_second": 4.472,
+ "step": 1155
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.637823526955428e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.05284766149829996,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ae0991113444e15d0f169b7fc0963db878251aba
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json
@@ -0,0 +1,378 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 4.0,
+ "eval_steps": 500,
+ "global_step": 1540,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8529596650600433,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.878183901309967,
+ "learning_rate": 2.8227544379109735e-05,
+ "loss": 1.762685546875,
+ "mean_token_accuracy": 0.6396001759171486,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.9309259909391403,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6279756426811218,
+ "learning_rate": 5.7031161092487016e-05,
+ "loss": 0.8922532653808594,
+ "mean_token_accuracy": 0.768746777176857,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8268384379148483,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.5478948354721069,
+ "learning_rate": 8.583477780586432e-05,
+ "loss": 0.7756452178955078,
+ "mean_token_accuracy": 0.7925205600261688,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7658095824718475,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.5972853899002075,
+ "learning_rate": 0.00011463839451924158,
+ "loss": 0.7161135101318359,
+ "mean_token_accuracy": 0.8019153982400894,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.769344270825386,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.5576531887054443,
+ "learning_rate": 0.0001434420112326189,
+ "loss": 0.716611557006836,
+ "mean_token_accuracy": 0.8028716742992401,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7470700722932816,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.3937978148460388,
+ "learning_rate": 0.0001722456279459962,
+ "loss": 0.6983388519287109,
+ "mean_token_accuracy": 0.80596988260746,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.7172233510017395,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.32564300298690796,
+ "learning_rate": 0.00020104924465937345,
+ "loss": 0.6731016540527344,
+ "mean_token_accuracy": 0.8119878542423248,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7397788639825124,
+ "eval_loss": 0.7304292917251587,
+ "eval_mean_token_accuracy": 0.7976073359067624,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 46.5688,
+ "eval_samples_per_second": 35.582,
+ "eval_steps_per_second": 4.467,
+ "step": 385
+ },
+ {
+ "entropy": 0.7035581320524216,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.45747122168540955,
+ "learning_rate": 0.00022177891520076015,
+ "loss": 0.6450813293457032,
+ "mean_token_accuracy": 0.8178210550546646,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6679977881908417,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.4456019401550293,
+ "learning_rate": 0.0002216012068086725,
+ "loss": 0.6189227294921875,
+ "mean_token_accuracy": 0.8257622331380844,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6574469250440598,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.3784359097480774,
+ "learning_rate": 0.00022119602267552194,
+ "loss": 0.6098381805419922,
+ "mean_token_accuracy": 0.8251238936185836,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6593573099374771,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.36508747935295105,
+ "learning_rate": 0.00022056419535323196,
+ "loss": 0.6137563705444335,
+ "mean_token_accuracy": 0.824503253698349,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6459340593218803,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.39773380756378174,
+ "learning_rate": 0.00021970702308872148,
+ "loss": 0.5986767196655274,
+ "mean_token_accuracy": 0.8270765203237533,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.633097175359726,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.41731834411621094,
+ "learning_rate": 0.00021862626715633265,
+ "loss": 0.5849922180175782,
+ "mean_token_accuracy": 0.8313153618574143,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6569280475378036,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.3067665100097656,
+ "learning_rate": 0.00021732414823885307,
+ "loss": 0.605062370300293,
+ "mean_token_accuracy": 0.8269566106796264,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6459099870920181,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5508913397789001,
+ "learning_rate": 0.00021580334186456886,
+ "loss": 0.596672134399414,
+ "mean_token_accuracy": 0.8287447422742844,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.6985598478752834,
+ "eval_loss": 0.6716415286064148,
+ "eval_mean_token_accuracy": 0.808078097895934,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 46.5197,
+ "eval_samples_per_second": 35.619,
+ "eval_steps_per_second": 4.471,
+ "step": 770
+ },
+ {
+ "entropy": 0.6113292586803436,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.37614893913269043,
+ "learning_rate": 0.00021406697290972404,
+ "loss": 0.5562425231933594,
+ "mean_token_accuracy": 0.8369060623645782,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.5720089420676231,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.35560500621795654,
+ "learning_rate": 0.00021211860917768236,
+ "loss": 0.521314697265625,
+ "mean_token_accuracy": 0.8453685063123703,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5757493850588798,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.304592490196228,
+ "learning_rate": 0.00020996225406798486,
+ "loss": 0.5258681106567383,
+ "mean_token_accuracy": 0.8431038129329681,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5790414094924927,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.29762616753578186,
+ "learning_rate": 0.00020760233835036664,
+ "loss": 0.5219763565063477,
+ "mean_token_accuracy": 0.8438457292318344,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.5783660891652107,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.3418065011501312,
+ "learning_rate": 0.00020504371106063417,
+ "loss": 0.5258687210083007,
+ "mean_token_accuracy": 0.8422278153896332,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5746926316618919,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.3708217740058899,
+ "learning_rate": 0.00020229162953711157,
+ "loss": 0.5161260223388672,
+ "mean_token_accuracy": 0.8453844922780991,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5676264691352845,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.26185691356658936,
+ "learning_rate": 0.00019935174861812654,
+ "loss": 0.5179851913452148,
+ "mean_token_accuracy": 0.8455151951313019,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5804974803328514,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.3131248652935028,
+ "learning_rate": 0.00019623010902273397,
+ "loss": 0.5243957138061524,
+ "mean_token_accuracy": 0.8455932134389877,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6151215266436338,
+ "eval_loss": 0.6557295918464661,
+ "eval_mean_token_accuracy": 0.8159411993737404,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 46.5152,
+ "eval_samples_per_second": 35.623,
+ "eval_steps_per_second": 4.472,
+ "step": 1155
+ },
+ {
+ "entropy": 0.5100037640333176,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.32568028569221497,
+ "learning_rate": 0.00019293312493855094,
+ "loss": 0.4522856140136719,
+ "mean_token_accuracy": 0.8600019490718842,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5058341425657272,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.3810749650001526,
+ "learning_rate": 0.00018946757084220762,
+ "loss": 0.44891536712646485,
+ "mean_token_accuracy": 0.8606845206022262,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5006153827905655,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.35473746061325073,
+ "learning_rate": 0.0001858405675794941,
+ "loss": 0.451065673828125,
+ "mean_token_accuracy": 0.8608392387628555,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.508549126982689,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.3414103388786316,
+ "learning_rate": 0.00018205956773380578,
+ "loss": 0.4535030746459961,
+ "mean_token_accuracy": 0.859148946404457,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.516265479028225,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.4597613513469696,
+ "learning_rate": 0.00017813234031295068,
+ "loss": 0.45882129669189453,
+ "mean_token_accuracy": 0.8582911169528962,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4902383416891098,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.3385583460330963,
+ "learning_rate": 0.0001740669547857841,
+ "loss": 0.4417523193359375,
+ "mean_token_accuracy": 0.8632881045341492,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5104234129190445,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.3935626149177551,
+ "learning_rate": 0.00016987176450147088,
+ "loss": 0.4547672653198242,
+ "mean_token_accuracy": 0.8592299193143844,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.565094108478381,
+ "eval_loss": 0.6673083305358887,
+ "eval_mean_token_accuracy": 0.8179781915476689,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 46.4987,
+ "eval_samples_per_second": 35.635,
+ "eval_steps_per_second": 4.473,
+ "step": 1540
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.1769399455551386e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.05284766149829996,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tools supplied: emit one system turn holding the optional system prompt plus the tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Defaults to the final message index when no plain user query is found. -#}
+{%- for message in messages[::-1] %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}{#- Render every turn; a leading system message was already emitted above. -#}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Reasoning is inline: split it from the visible answer at the closing think tag. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last user query may carry a think block. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}{#- Serialise each call as a JSON object wrapped in tool_call tags. -#}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}{#- Open the assistant turn for generation. -#}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking explicitly disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..23a49374c4be1f93d27cf6023f58c78f7c553b38
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json
@@ -0,0 +1,469 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 5.0,
+ "eval_steps": 500,
+ "global_step": 1925,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8529596650600433,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.878183901309967,
+ "learning_rate": 2.8227544379109735e-05,
+ "loss": 1.762685546875,
+ "mean_token_accuracy": 0.6396001759171486,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.9309259909391403,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6279756426811218,
+ "learning_rate": 5.7031161092487016e-05,
+ "loss": 0.8922532653808594,
+ "mean_token_accuracy": 0.768746777176857,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8268384379148483,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.5478948354721069,
+ "learning_rate": 8.583477780586432e-05,
+ "loss": 0.7756452178955078,
+ "mean_token_accuracy": 0.7925205600261688,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7658095824718475,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.5972853899002075,
+ "learning_rate": 0.00011463839451924158,
+ "loss": 0.7161135101318359,
+ "mean_token_accuracy": 0.8019153982400894,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.769344270825386,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.5576531887054443,
+ "learning_rate": 0.0001434420112326189,
+ "loss": 0.716611557006836,
+ "mean_token_accuracy": 0.8028716742992401,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7470700722932816,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.3937978148460388,
+ "learning_rate": 0.0001722456279459962,
+ "loss": 0.6983388519287109,
+ "mean_token_accuracy": 0.80596988260746,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.7172233510017395,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.32564300298690796,
+ "learning_rate": 0.00020104924465937345,
+ "loss": 0.6731016540527344,
+ "mean_token_accuracy": 0.8119878542423248,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7397788639825124,
+ "eval_loss": 0.7304292917251587,
+ "eval_mean_token_accuracy": 0.7976073359067624,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 46.5688,
+ "eval_samples_per_second": 35.582,
+ "eval_steps_per_second": 4.467,
+ "step": 385
+ },
+ {
+ "entropy": 0.7035581320524216,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.45747122168540955,
+ "learning_rate": 0.00022177891520076015,
+ "loss": 0.6450813293457032,
+ "mean_token_accuracy": 0.8178210550546646,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6679977881908417,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.4456019401550293,
+ "learning_rate": 0.0002216012068086725,
+ "loss": 0.6189227294921875,
+ "mean_token_accuracy": 0.8257622331380844,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6574469250440598,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.3784359097480774,
+ "learning_rate": 0.00022119602267552194,
+ "loss": 0.6098381805419922,
+ "mean_token_accuracy": 0.8251238936185836,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6593573099374771,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.36508747935295105,
+ "learning_rate": 0.00022056419535323196,
+ "loss": 0.6137563705444335,
+ "mean_token_accuracy": 0.824503253698349,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6459340593218803,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.39773380756378174,
+ "learning_rate": 0.00021970702308872148,
+ "loss": 0.5986767196655274,
+ "mean_token_accuracy": 0.8270765203237533,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.633097175359726,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.41731834411621094,
+ "learning_rate": 0.00021862626715633265,
+ "loss": 0.5849922180175782,
+ "mean_token_accuracy": 0.8313153618574143,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6569280475378036,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.3067665100097656,
+ "learning_rate": 0.00021732414823885307,
+ "loss": 0.605062370300293,
+ "mean_token_accuracy": 0.8269566106796264,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6459099870920181,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5508913397789001,
+ "learning_rate": 0.00021580334186456886,
+ "loss": 0.596672134399414,
+ "mean_token_accuracy": 0.8287447422742844,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.6985598478752834,
+ "eval_loss": 0.6716415286064148,
+ "eval_mean_token_accuracy": 0.808078097895934,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 46.5197,
+ "eval_samples_per_second": 35.619,
+ "eval_steps_per_second": 4.471,
+ "step": 770
+ },
+ {
+ "entropy": 0.6113292586803436,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.37614893913269043,
+ "learning_rate": 0.00021406697290972404,
+ "loss": 0.5562425231933594,
+ "mean_token_accuracy": 0.8369060623645782,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.5720089420676231,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.35560500621795654,
+ "learning_rate": 0.00021211860917768236,
+ "loss": 0.521314697265625,
+ "mean_token_accuracy": 0.8453685063123703,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5757493850588798,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.304592490196228,
+ "learning_rate": 0.00020996225406798486,
+ "loss": 0.5258681106567383,
+ "mean_token_accuracy": 0.8431038129329681,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5790414094924927,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.29762616753578186,
+ "learning_rate": 0.00020760233835036664,
+ "loss": 0.5219763565063477,
+ "mean_token_accuracy": 0.8438457292318344,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.5783660891652107,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.3418065011501312,
+ "learning_rate": 0.00020504371106063417,
+ "loss": 0.5258687210083007,
+ "mean_token_accuracy": 0.8422278153896332,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5746926316618919,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.3708217740058899,
+ "learning_rate": 0.00020229162953711157,
+ "loss": 0.5161260223388672,
+ "mean_token_accuracy": 0.8453844922780991,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5676264691352845,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.26185691356658936,
+ "learning_rate": 0.00019935174861812654,
+ "loss": 0.5179851913452148,
+ "mean_token_accuracy": 0.8455151951313019,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5804974803328514,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.3131248652935028,
+ "learning_rate": 0.00019623010902273397,
+ "loss": 0.5243957138061524,
+ "mean_token_accuracy": 0.8455932134389877,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6151215266436338,
+ "eval_loss": 0.6557295918464661,
+ "eval_mean_token_accuracy": 0.8159411993737404,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 46.5152,
+ "eval_samples_per_second": 35.623,
+ "eval_steps_per_second": 4.472,
+ "step": 1155
+ },
+ {
+ "entropy": 0.5100037640333176,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.32568028569221497,
+ "learning_rate": 0.00019293312493855094,
+ "loss": 0.4522856140136719,
+ "mean_token_accuracy": 0.8600019490718842,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5058341425657272,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.3810749650001526,
+ "learning_rate": 0.00018946757084220762,
+ "loss": 0.44891536712646485,
+ "mean_token_accuracy": 0.8606845206022262,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5006153827905655,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.35473746061325073,
+ "learning_rate": 0.0001858405675794941,
+ "loss": 0.451065673828125,
+ "mean_token_accuracy": 0.8608392387628555,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.508549126982689,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.3414103388786316,
+ "learning_rate": 0.00018205956773380578,
+ "loss": 0.4535030746459961,
+ "mean_token_accuracy": 0.859148946404457,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.516265479028225,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.4597613513469696,
+ "learning_rate": 0.00017813234031295068,
+ "loss": 0.45882129669189453,
+ "mean_token_accuracy": 0.8582911169528962,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4902383416891098,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.3385583460330963,
+ "learning_rate": 0.0001740669547857841,
+ "loss": 0.4417523193359375,
+ "mean_token_accuracy": 0.8632881045341492,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5104234129190445,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.3935626149177551,
+ "learning_rate": 0.00016987176450147088,
+ "loss": 0.4547672653198242,
+ "mean_token_accuracy": 0.8592299193143844,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.565094108478381,
+ "eval_loss": 0.6673083305358887,
+ "eval_mean_token_accuracy": 0.8179781915476689,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 46.4987,
+ "eval_samples_per_second": 35.635,
+ "eval_steps_per_second": 4.473,
+ "step": 1540
+ },
+ {
+ "entropy": 0.49345644742250444,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.42027363181114197,
+ "learning_rate": 0.00016555538952544487,
+ "loss": 0.4355708312988281,
+ "mean_token_accuracy": 0.8648384511470795,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.4247111546993256,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.4478093087673187,
+ "learning_rate": 0.00016112669892733307,
+ "loss": 0.36362716674804685,
+ "mean_token_accuracy": 0.8844931781291961,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.42337420970201495,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.35812026262283325,
+ "learning_rate": 0.00015659479255723875,
+ "loss": 0.3651982498168945,
+ "mean_token_accuracy": 0.8830066406726838,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.43247521698474883,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.3665122985839844,
+ "learning_rate": 0.0001519689823478283,
+ "loss": 0.37368186950683596,
+ "mean_token_accuracy": 0.8800795775651932,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.4328634282946587,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.33961260318756104,
+ "learning_rate": 0.00014725877318064152,
+ "loss": 0.37599964141845704,
+ "mean_token_accuracy": 0.8797144430875778,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.42742862343788146,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.4312469959259033,
+ "learning_rate": 0.0001424738433559405,
+ "loss": 0.37354656219482424,
+ "mean_token_accuracy": 0.880826217532158,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4280877533555031,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.4343482255935669,
+ "learning_rate": 0.0001376240247062263,
+ "loss": 0.3688441467285156,
+ "mean_token_accuracy": 0.8814105206727981,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.42952129155397417,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.47131651639938354,
+ "learning_rate": 0.00013271928239428512,
+ "loss": 0.37270416259765626,
+ "mean_token_accuracy": 0.8788579875230789,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5064233084424183,
+ "eval_loss": 0.689122200012207,
+ "eval_mean_token_accuracy": 0.818150080453891,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 46.5114,
+ "eval_samples_per_second": 35.626,
+ "eval_steps_per_second": 4.472,
+ "step": 1925
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.723982041528156e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.05284766149829996,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tools supplied: emit one system turn holding the optional system prompt plus the tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Defaults to the final message index when no plain user query is found. -#}
+{%- for message in messages[::-1] %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}{#- Render every turn; a leading system message was already emitted above. -#}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Reasoning is inline: split it from the visible answer at the closing think tag. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last user query may carry a think block. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}{#- Serialise each call as a JSON object wrapped in tool_call tags. -#}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}{#- Open the assistant turn for generation. -#}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking explicitly disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8ac719a0154a177a028f67b0e5c3ef960a7aca74
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json
@@ -0,0 +1,560 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 6.0,
+ "eval_steps": 500,
+ "global_step": 2310,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8529596650600433,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.878183901309967,
+ "learning_rate": 2.8227544379109735e-05,
+ "loss": 1.762685546875,
+ "mean_token_accuracy": 0.6396001759171486,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.9309259909391403,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6279756426811218,
+ "learning_rate": 5.7031161092487016e-05,
+ "loss": 0.8922532653808594,
+ "mean_token_accuracy": 0.768746777176857,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8268384379148483,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.5478948354721069,
+ "learning_rate": 8.583477780586432e-05,
+ "loss": 0.7756452178955078,
+ "mean_token_accuracy": 0.7925205600261688,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7658095824718475,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.5972853899002075,
+ "learning_rate": 0.00011463839451924158,
+ "loss": 0.7161135101318359,
+ "mean_token_accuracy": 0.8019153982400894,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.769344270825386,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.5576531887054443,
+ "learning_rate": 0.0001434420112326189,
+ "loss": 0.716611557006836,
+ "mean_token_accuracy": 0.8028716742992401,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7470700722932816,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.3937978148460388,
+ "learning_rate": 0.0001722456279459962,
+ "loss": 0.6983388519287109,
+ "mean_token_accuracy": 0.80596988260746,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.7172233510017395,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.32564300298690796,
+ "learning_rate": 0.00020104924465937345,
+ "loss": 0.6731016540527344,
+ "mean_token_accuracy": 0.8119878542423248,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7397788639825124,
+ "eval_loss": 0.7304292917251587,
+ "eval_mean_token_accuracy": 0.7976073359067624,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 46.5688,
+ "eval_samples_per_second": 35.582,
+ "eval_steps_per_second": 4.467,
+ "step": 385
+ },
+ {
+ "entropy": 0.7035581320524216,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.45747122168540955,
+ "learning_rate": 0.00022177891520076015,
+ "loss": 0.6450813293457032,
+ "mean_token_accuracy": 0.8178210550546646,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6679977881908417,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.4456019401550293,
+ "learning_rate": 0.0002216012068086725,
+ "loss": 0.6189227294921875,
+ "mean_token_accuracy": 0.8257622331380844,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6574469250440598,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.3784359097480774,
+ "learning_rate": 0.00022119602267552194,
+ "loss": 0.6098381805419922,
+ "mean_token_accuracy": 0.8251238936185836,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6593573099374771,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.36508747935295105,
+ "learning_rate": 0.00022056419535323196,
+ "loss": 0.6137563705444335,
+ "mean_token_accuracy": 0.824503253698349,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6459340593218803,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.39773380756378174,
+ "learning_rate": 0.00021970702308872148,
+ "loss": 0.5986767196655274,
+ "mean_token_accuracy": 0.8270765203237533,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.633097175359726,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.41731834411621094,
+ "learning_rate": 0.00021862626715633265,
+ "loss": 0.5849922180175782,
+ "mean_token_accuracy": 0.8313153618574143,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6569280475378036,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.3067665100097656,
+ "learning_rate": 0.00021732414823885307,
+ "loss": 0.605062370300293,
+ "mean_token_accuracy": 0.8269566106796264,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6459099870920181,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5508913397789001,
+ "learning_rate": 0.00021580334186456886,
+ "loss": 0.596672134399414,
+ "mean_token_accuracy": 0.8287447422742844,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.6985598478752834,
+ "eval_loss": 0.6716415286064148,
+ "eval_mean_token_accuracy": 0.808078097895934,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 46.5197,
+ "eval_samples_per_second": 35.619,
+ "eval_steps_per_second": 4.471,
+ "step": 770
+ },
+ {
+ "entropy": 0.6113292586803436,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.37614893913269043,
+ "learning_rate": 0.00021406697290972404,
+ "loss": 0.5562425231933594,
+ "mean_token_accuracy": 0.8369060623645782,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.5720089420676231,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.35560500621795654,
+ "learning_rate": 0.00021211860917768236,
+ "loss": 0.521314697265625,
+ "mean_token_accuracy": 0.8453685063123703,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5757493850588798,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.304592490196228,
+ "learning_rate": 0.00020996225406798486,
+ "loss": 0.5258681106567383,
+ "mean_token_accuracy": 0.8431038129329681,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5790414094924927,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.29762616753578186,
+ "learning_rate": 0.00020760233835036664,
+ "loss": 0.5219763565063477,
+ "mean_token_accuracy": 0.8438457292318344,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.5783660891652107,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.3418065011501312,
+ "learning_rate": 0.00020504371106063417,
+ "loss": 0.5258687210083007,
+ "mean_token_accuracy": 0.8422278153896332,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5746926316618919,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.3708217740058899,
+ "learning_rate": 0.00020229162953711157,
+ "loss": 0.5161260223388672,
+ "mean_token_accuracy": 0.8453844922780991,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5676264691352845,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.26185691356658936,
+ "learning_rate": 0.00019935174861812654,
+ "loss": 0.5179851913452148,
+ "mean_token_accuracy": 0.8455151951313019,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5804974803328514,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.3131248652935028,
+ "learning_rate": 0.00019623010902273397,
+ "loss": 0.5243957138061524,
+ "mean_token_accuracy": 0.8455932134389877,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6151215266436338,
+ "eval_loss": 0.6557295918464661,
+ "eval_mean_token_accuracy": 0.8159411993737404,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 46.5152,
+ "eval_samples_per_second": 35.623,
+ "eval_steps_per_second": 4.472,
+ "step": 1155
+ },
+ {
+ "entropy": 0.5100037640333176,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.32568028569221497,
+ "learning_rate": 0.00019293312493855094,
+ "loss": 0.4522856140136719,
+ "mean_token_accuracy": 0.8600019490718842,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5058341425657272,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.3810749650001526,
+ "learning_rate": 0.00018946757084220762,
+ "loss": 0.44891536712646485,
+ "mean_token_accuracy": 0.8606845206022262,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5006153827905655,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.35473746061325073,
+ "learning_rate": 0.0001858405675794941,
+ "loss": 0.451065673828125,
+ "mean_token_accuracy": 0.8608392387628555,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.508549126982689,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.3414103388786316,
+ "learning_rate": 0.00018205956773380578,
+ "loss": 0.4535030746459961,
+ "mean_token_accuracy": 0.859148946404457,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.516265479028225,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.4597613513469696,
+ "learning_rate": 0.00017813234031295068,
+ "loss": 0.45882129669189453,
+ "mean_token_accuracy": 0.8582911169528962,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4902383416891098,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.3385583460330963,
+ "learning_rate": 0.0001740669547857841,
+ "loss": 0.4417523193359375,
+ "mean_token_accuracy": 0.8632881045341492,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5104234129190445,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.3935626149177551,
+ "learning_rate": 0.00016987176450147088,
+ "loss": 0.4547672653198242,
+ "mean_token_accuracy": 0.8592299193143844,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.565094108478381,
+ "eval_loss": 0.6673083305358887,
+ "eval_mean_token_accuracy": 0.8179781915476689,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 46.4987,
+ "eval_samples_per_second": 35.635,
+ "eval_steps_per_second": 4.473,
+ "step": 1540
+ },
+ {
+ "entropy": 0.49345644742250444,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.42027363181114197,
+ "learning_rate": 0.00016555538952544487,
+ "loss": 0.4355708312988281,
+ "mean_token_accuracy": 0.8648384511470795,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.4247111546993256,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.4478093087673187,
+ "learning_rate": 0.00016112669892733307,
+ "loss": 0.36362716674804685,
+ "mean_token_accuracy": 0.8844931781291961,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.42337420970201495,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.35812026262283325,
+ "learning_rate": 0.00015659479255723875,
+ "loss": 0.3651982498168945,
+ "mean_token_accuracy": 0.8830066406726838,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.43247521698474883,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.3665122985839844,
+ "learning_rate": 0.0001519689823478283,
+ "loss": 0.37368186950683596,
+ "mean_token_accuracy": 0.8800795775651932,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.4328634282946587,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.33961260318756104,
+ "learning_rate": 0.00014725877318064152,
+ "loss": 0.37599964141845704,
+ "mean_token_accuracy": 0.8797144430875778,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.42742862343788146,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.4312469959259033,
+ "learning_rate": 0.0001424738433559405,
+ "loss": 0.37354656219482424,
+ "mean_token_accuracy": 0.880826217532158,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4280877533555031,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.4343482255935669,
+ "learning_rate": 0.0001376240247062263,
+ "loss": 0.3688441467285156,
+ "mean_token_accuracy": 0.8814105206727981,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.42952129155397417,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.47131651639938354,
+ "learning_rate": 0.00013271928239428512,
+ "loss": 0.37270416259765626,
+ "mean_token_accuracy": 0.8788579875230789,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5064233084424183,
+ "eval_loss": 0.689122200012207,
+ "eval_mean_token_accuracy": 0.818150080453891,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 46.5114,
+ "eval_samples_per_second": 35.626,
+ "eval_steps_per_second": 4.472,
+ "step": 1925
+ },
+ {
+ "entropy": 0.3798492255806923,
+ "epoch": 5.064935064935065,
+ "grad_norm": 0.414468377828598,
+ "learning_rate": 0.0001277696944372747,
+ "loss": 0.31735713958740236,
+ "mean_token_accuracy": 0.8974772602319717,
+ "num_tokens": 4975955.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.3367840954661369,
+ "epoch": 5.194805194805195,
+ "grad_norm": 0.4515029191970825,
+ "learning_rate": 0.00012278543099892257,
+ "loss": 0.272756290435791,
+ "mean_token_accuracy": 0.9084931749105454,
+ "num_tokens": 5104770.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.34766108095645903,
+ "epoch": 5.324675324675325,
+ "grad_norm": 0.4040578007698059,
+ "learning_rate": 0.00011777673349238672,
+ "loss": 0.2792487144470215,
+ "mean_token_accuracy": 0.9065623581409454,
+ "num_tokens": 5229052.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.3566804251074791,
+ "epoch": 5.454545454545454,
+ "grad_norm": 0.5059600472450256,
+ "learning_rate": 0.00011275389353671628,
+ "loss": 0.2896596145629883,
+ "mean_token_accuracy": 0.9045185309648514,
+ "num_tokens": 5357753.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3462292793393135,
+ "epoch": 5.584415584415584,
+ "grad_norm": 0.4664144217967987,
+ "learning_rate": 0.00010772723181015153,
+ "loss": 0.27794593811035156,
+ "mean_token_accuracy": 0.9075321304798126,
+ "num_tokens": 5481550.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.3434346827864647,
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.46017780900001526,
+ "learning_rate": 0.00010270707684371499,
+ "loss": 0.2783885383605957,
+ "mean_token_accuracy": 0.9063384455442428,
+ "num_tokens": 5609104.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.341832632124424,
+ "epoch": 5.8441558441558445,
+ "grad_norm": 0.429608017206192,
+ "learning_rate": 9.77037437986665e-05,
+ "loss": 0.2815263748168945,
+ "mean_token_accuracy": 0.9060239523649216,
+ "num_tokens": 5737347.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.3492265248298645,
+ "epoch": 5.974025974025974,
+ "grad_norm": 0.5019266605377197,
+ "learning_rate": 9.272751327143021e-05,
+ "loss": 0.2844840621948242,
+ "mean_token_accuracy": 0.9042869365215301,
+ "num_tokens": 5861872.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.4365639974578069,
+ "eval_loss": 0.7624168395996094,
+ "eval_mean_token_accuracy": 0.8165808867376584,
+ "eval_num_tokens": 5889300.0,
+ "eval_runtime": 46.5221,
+ "eval_samples_per_second": 35.617,
+ "eval_steps_per_second": 4.471,
+ "step": 2310
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.26902097163009e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.05284766149829996,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6333976c357544886163bf2eea9f10e174b8af47
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json
@@ -0,0 +1,641 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.0,
+ "eval_steps": 500,
+ "global_step": 2695,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8529596650600433,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.878183901309967,
+ "learning_rate": 2.8227544379109735e-05,
+ "loss": 1.762685546875,
+ "mean_token_accuracy": 0.6396001759171486,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.9309259909391403,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6279756426811218,
+ "learning_rate": 5.7031161092487016e-05,
+ "loss": 0.8922532653808594,
+ "mean_token_accuracy": 0.768746777176857,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8268384379148483,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.5478948354721069,
+ "learning_rate": 8.583477780586432e-05,
+ "loss": 0.7756452178955078,
+ "mean_token_accuracy": 0.7925205600261688,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7658095824718475,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.5972853899002075,
+ "learning_rate": 0.00011463839451924158,
+ "loss": 0.7161135101318359,
+ "mean_token_accuracy": 0.8019153982400894,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.769344270825386,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.5576531887054443,
+ "learning_rate": 0.0001434420112326189,
+ "loss": 0.716611557006836,
+ "mean_token_accuracy": 0.8028716742992401,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7470700722932816,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.3937978148460388,
+ "learning_rate": 0.0001722456279459962,
+ "loss": 0.6983388519287109,
+ "mean_token_accuracy": 0.80596988260746,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.7172233510017395,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.32564300298690796,
+ "learning_rate": 0.00020104924465937345,
+ "loss": 0.6731016540527344,
+ "mean_token_accuracy": 0.8119878542423248,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7397788639825124,
+ "eval_loss": 0.7304292917251587,
+ "eval_mean_token_accuracy": 0.7976073359067624,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 46.5688,
+ "eval_samples_per_second": 35.582,
+ "eval_steps_per_second": 4.467,
+ "step": 385
+ },
+ {
+ "entropy": 0.7035581320524216,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.45747122168540955,
+ "learning_rate": 0.00022177891520076015,
+ "loss": 0.6450813293457032,
+ "mean_token_accuracy": 0.8178210550546646,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6679977881908417,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.4456019401550293,
+ "learning_rate": 0.0002216012068086725,
+ "loss": 0.6189227294921875,
+ "mean_token_accuracy": 0.8257622331380844,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6574469250440598,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.3784359097480774,
+ "learning_rate": 0.00022119602267552194,
+ "loss": 0.6098381805419922,
+ "mean_token_accuracy": 0.8251238936185836,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6593573099374771,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.36508747935295105,
+ "learning_rate": 0.00022056419535323196,
+ "loss": 0.6137563705444335,
+ "mean_token_accuracy": 0.824503253698349,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6459340593218803,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.39773380756378174,
+ "learning_rate": 0.00021970702308872148,
+ "loss": 0.5986767196655274,
+ "mean_token_accuracy": 0.8270765203237533,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.633097175359726,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.41731834411621094,
+ "learning_rate": 0.00021862626715633265,
+ "loss": 0.5849922180175782,
+ "mean_token_accuracy": 0.8313153618574143,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6569280475378036,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.3067665100097656,
+ "learning_rate": 0.00021732414823885307,
+ "loss": 0.605062370300293,
+ "mean_token_accuracy": 0.8269566106796264,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6459099870920181,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5508913397789001,
+ "learning_rate": 0.00021580334186456886,
+ "loss": 0.596672134399414,
+ "mean_token_accuracy": 0.8287447422742844,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.6985598478752834,
+ "eval_loss": 0.6716415286064148,
+ "eval_mean_token_accuracy": 0.808078097895934,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 46.5197,
+ "eval_samples_per_second": 35.619,
+ "eval_steps_per_second": 4.471,
+ "step": 770
+ },
+ {
+ "entropy": 0.6113292586803436,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.37614893913269043,
+ "learning_rate": 0.00021406697290972404,
+ "loss": 0.5562425231933594,
+ "mean_token_accuracy": 0.8369060623645782,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.5720089420676231,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.35560500621795654,
+ "learning_rate": 0.00021211860917768236,
+ "loss": 0.521314697265625,
+ "mean_token_accuracy": 0.8453685063123703,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5757493850588798,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.304592490196228,
+ "learning_rate": 0.00020996225406798486,
+ "loss": 0.5258681106567383,
+ "mean_token_accuracy": 0.8431038129329681,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5790414094924927,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.29762616753578186,
+ "learning_rate": 0.00020760233835036664,
+ "loss": 0.5219763565063477,
+ "mean_token_accuracy": 0.8438457292318344,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.5783660891652107,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.3418065011501312,
+ "learning_rate": 0.00020504371106063417,
+ "loss": 0.5258687210083007,
+ "mean_token_accuracy": 0.8422278153896332,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5746926316618919,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.3708217740058899,
+ "learning_rate": 0.00020229162953711157,
+ "loss": 0.5161260223388672,
+ "mean_token_accuracy": 0.8453844922780991,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5676264691352845,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.26185691356658936,
+ "learning_rate": 0.00019935174861812654,
+ "loss": 0.5179851913452148,
+ "mean_token_accuracy": 0.8455151951313019,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5804974803328514,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.3131248652935028,
+ "learning_rate": 0.00019623010902273397,
+ "loss": 0.5243957138061524,
+ "mean_token_accuracy": 0.8455932134389877,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6151215266436338,
+ "eval_loss": 0.6557295918464661,
+ "eval_mean_token_accuracy": 0.8159411993737404,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 46.5152,
+ "eval_samples_per_second": 35.623,
+ "eval_steps_per_second": 4.472,
+ "step": 1155
+ },
+ {
+ "entropy": 0.5100037640333176,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.32568028569221497,
+ "learning_rate": 0.00019293312493855094,
+ "loss": 0.4522856140136719,
+ "mean_token_accuracy": 0.8600019490718842,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5058341425657272,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.3810749650001526,
+ "learning_rate": 0.00018946757084220762,
+ "loss": 0.44891536712646485,
+ "mean_token_accuracy": 0.8606845206022262,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5006153827905655,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.35473746061325073,
+ "learning_rate": 0.0001858405675794941,
+ "loss": 0.451065673828125,
+ "mean_token_accuracy": 0.8608392387628555,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.508549126982689,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.3414103388786316,
+ "learning_rate": 0.00018205956773380578,
+ "loss": 0.4535030746459961,
+ "mean_token_accuracy": 0.859148946404457,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.516265479028225,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.4597613513469696,
+ "learning_rate": 0.00017813234031295068,
+ "loss": 0.45882129669189453,
+ "mean_token_accuracy": 0.8582911169528962,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4902383416891098,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.3385583460330963,
+ "learning_rate": 0.0001740669547857841,
+ "loss": 0.4417523193359375,
+ "mean_token_accuracy": 0.8632881045341492,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5104234129190445,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.3935626149177551,
+ "learning_rate": 0.00016987176450147088,
+ "loss": 0.4547672653198242,
+ "mean_token_accuracy": 0.8592299193143844,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.565094108478381,
+ "eval_loss": 0.6673083305358887,
+ "eval_mean_token_accuracy": 0.8179781915476689,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 46.4987,
+ "eval_samples_per_second": 35.635,
+ "eval_steps_per_second": 4.473,
+ "step": 1540
+ },
+ {
+ "entropy": 0.49345644742250444,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.42027363181114197,
+ "learning_rate": 0.00016555538952544487,
+ "loss": 0.4355708312988281,
+ "mean_token_accuracy": 0.8648384511470795,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.4247111546993256,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.4478093087673187,
+ "learning_rate": 0.00016112669892733307,
+ "loss": 0.36362716674804685,
+ "mean_token_accuracy": 0.8844931781291961,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.42337420970201495,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.35812026262283325,
+ "learning_rate": 0.00015659479255723875,
+ "loss": 0.3651982498168945,
+ "mean_token_accuracy": 0.8830066406726838,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.43247521698474883,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.3665122985839844,
+ "learning_rate": 0.0001519689823478283,
+ "loss": 0.37368186950683596,
+ "mean_token_accuracy": 0.8800795775651932,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.4328634282946587,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.33961260318756104,
+ "learning_rate": 0.00014725877318064152,
+ "loss": 0.37599964141845704,
+ "mean_token_accuracy": 0.8797144430875778,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.42742862343788146,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.4312469959259033,
+ "learning_rate": 0.0001424738433559405,
+ "loss": 0.37354656219482424,
+ "mean_token_accuracy": 0.880826217532158,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4280877533555031,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.4343482255935669,
+ "learning_rate": 0.0001376240247062263,
+ "loss": 0.3688441467285156,
+ "mean_token_accuracy": 0.8814105206727981,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.42952129155397417,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.47131651639938354,
+ "learning_rate": 0.00013271928239428512,
+ "loss": 0.37270416259765626,
+ "mean_token_accuracy": 0.8788579875230789,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5064233084424183,
+ "eval_loss": 0.689122200012207,
+ "eval_mean_token_accuracy": 0.818150080453891,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 46.5114,
+ "eval_samples_per_second": 35.626,
+ "eval_steps_per_second": 4.472,
+ "step": 1925
+ },
+ {
+ "entropy": 0.3798492255806923,
+ "epoch": 5.064935064935065,
+ "grad_norm": 0.414468377828598,
+ "learning_rate": 0.0001277696944372747,
+ "loss": 0.31735713958740236,
+ "mean_token_accuracy": 0.8974772602319717,
+ "num_tokens": 4975955.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.3367840954661369,
+ "epoch": 5.194805194805195,
+ "grad_norm": 0.4515029191970825,
+ "learning_rate": 0.00012278543099892257,
+ "loss": 0.272756290435791,
+ "mean_token_accuracy": 0.9084931749105454,
+ "num_tokens": 5104770.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.34766108095645903,
+ "epoch": 5.324675324675325,
+ "grad_norm": 0.4040578007698059,
+ "learning_rate": 0.00011777673349238672,
+ "loss": 0.2792487144470215,
+ "mean_token_accuracy": 0.9065623581409454,
+ "num_tokens": 5229052.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.3566804251074791,
+ "epoch": 5.454545454545454,
+ "grad_norm": 0.5059600472450256,
+ "learning_rate": 0.00011275389353671628,
+ "loss": 0.2896596145629883,
+ "mean_token_accuracy": 0.9045185309648514,
+ "num_tokens": 5357753.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3462292793393135,
+ "epoch": 5.584415584415584,
+ "grad_norm": 0.4664144217967987,
+ "learning_rate": 0.00010772723181015153,
+ "loss": 0.27794593811035156,
+ "mean_token_accuracy": 0.9075321304798126,
+ "num_tokens": 5481550.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.3434346827864647,
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.46017780900001526,
+ "learning_rate": 0.00010270707684371499,
+ "loss": 0.2783885383605957,
+ "mean_token_accuracy": 0.9063384455442428,
+ "num_tokens": 5609104.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.341832632124424,
+ "epoch": 5.8441558441558445,
+ "grad_norm": 0.429608017206192,
+ "learning_rate": 9.77037437986665e-05,
+ "loss": 0.2815263748168945,
+ "mean_token_accuracy": 0.9060239523649216,
+ "num_tokens": 5737347.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.3492265248298645,
+ "epoch": 5.974025974025974,
+ "grad_norm": 0.5019266605377197,
+ "learning_rate": 9.272751327143021e-05,
+ "loss": 0.2844840621948242,
+ "mean_token_accuracy": 0.9042869365215301,
+ "num_tokens": 5861872.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.4365639974578069,
+ "eval_loss": 0.7624168395996094,
+ "eval_mean_token_accuracy": 0.8165808867376584,
+ "eval_num_tokens": 5889300.0,
+ "eval_runtime": 46.5221,
+ "eval_samples_per_second": 35.617,
+ "eval_steps_per_second": 4.471,
+ "step": 2310
+ },
+ {
+ "entropy": 0.28290177062153815,
+ "epoch": 6.103896103896104,
+ "grad_norm": 0.5457249283790588,
+ "learning_rate": 8.77886101695435e-05,
+ "loss": 0.2029383087158203,
+ "mean_token_accuracy": 0.9317537224292756,
+ "num_tokens": 5990679.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.2539422053098679,
+ "epoch": 6.233766233766234,
+ "grad_norm": 1.7734259366989136,
+ "learning_rate": 8.289718270203239e-05,
+ "loss": 0.1847425079345703,
+ "mean_token_accuracy": 0.937881036400795,
+ "num_tokens": 6117918.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.2555230759084225,
+ "epoch": 6.363636363636363,
+ "grad_norm": 0.49776697158813477,
+ "learning_rate": 7.806328152738371e-05,
+ "loss": 0.18783441543579102,
+ "mean_token_accuracy": 0.936203356385231,
+ "num_tokens": 6248566.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.2529813493788242,
+ "epoch": 6.4935064935064934,
+ "grad_norm": 0.4968299865722656,
+ "learning_rate": 7.32968391019587e-05,
+ "loss": 0.18458471298217774,
+ "mean_token_accuracy": 0.9365487760305404,
+ "num_tokens": 6380529.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.26623836129903794,
+ "epoch": 6.623376623376624,
+ "grad_norm": 0.6177894473075867,
+ "learning_rate": 6.860764927128271e-05,
+ "loss": 0.19328956604003905,
+ "mean_token_accuracy": 0.9330078029632568,
+ "num_tokens": 6501669.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.2671951600909233,
+ "epoch": 6.753246753246753,
+ "grad_norm": 0.6792670488357544,
+ "learning_rate": 6.400534714614501e-05,
+ "loss": 0.19405254364013672,
+ "mean_token_accuracy": 0.9335101181268692,
+ "num_tokens": 6624404.0,
+ "step": 2600
+ },
+ {
+ "entropy": 0.2522234851121902,
+ "epoch": 6.883116883116883,
+ "grad_norm": 0.4798950254917145,
+ "learning_rate": 5.949938930485951e-05,
+ "loss": 0.1846565818786621,
+ "mean_token_accuracy": 0.9369161009788514,
+ "num_tokens": 6755532.0,
+ "step": 2650
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.3764066012719503,
+ "eval_loss": 0.8554975390434265,
+ "eval_mean_token_accuracy": 0.8132733049301001,
+ "eval_num_tokens": 6870850.0,
+ "eval_runtime": 46.525,
+ "eval_samples_per_second": 35.615,
+ "eval_steps_per_second": 4.471,
+ "step": 2695
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.810735902601032e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.05284766149829996,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/chat_template.jinja
@@ -0,0 +1,91 @@
+{#- Qwen3 ChatML chat template: renders `messages` (and optional `tools`) as <|im_start|>role ... <|im_end|> turns. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Walk the messages backwards to find the last user turn that is not a wrapped <tool_response>...</tool_response>. #}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {#- No separate reasoning_content field: split inline <think>...</think> reasoning from the answer text. #}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Only assistant turns after the last user query get a <think> block (the final turn always, earlier ones when they have reasoning). #}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages share one user turn; each result is wrapped in <tool_response>...</tool_response>. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {#- When enable_thinking is explicitly false, the generation prompt is pre-filled with an empty <think></think> block. #}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9416cb221138e648d560c5a35124f78102baaacf
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json
@@ -0,0 +1,732 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.0,
+ "eval_steps": 500,
+ "global_step": 3080,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8529596650600433,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.878183901309967,
+ "learning_rate": 2.8227544379109735e-05,
+ "loss": 1.762685546875,
+ "mean_token_accuracy": 0.6396001759171486,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.9309259909391403,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6279756426811218,
+ "learning_rate": 5.7031161092487016e-05,
+ "loss": 0.8922532653808594,
+ "mean_token_accuracy": 0.768746777176857,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8268384379148483,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.5478948354721069,
+ "learning_rate": 8.583477780586432e-05,
+ "loss": 0.7756452178955078,
+ "mean_token_accuracy": 0.7925205600261688,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7658095824718475,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.5972853899002075,
+ "learning_rate": 0.00011463839451924158,
+ "loss": 0.7161135101318359,
+ "mean_token_accuracy": 0.8019153982400894,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.769344270825386,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.5576531887054443,
+ "learning_rate": 0.0001434420112326189,
+ "loss": 0.716611557006836,
+ "mean_token_accuracy": 0.8028716742992401,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7470700722932816,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.3937978148460388,
+ "learning_rate": 0.0001722456279459962,
+ "loss": 0.6983388519287109,
+ "mean_token_accuracy": 0.80596988260746,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.7172233510017395,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.32564300298690796,
+ "learning_rate": 0.00020104924465937345,
+ "loss": 0.6731016540527344,
+ "mean_token_accuracy": 0.8119878542423248,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7397788639825124,
+ "eval_loss": 0.7304292917251587,
+ "eval_mean_token_accuracy": 0.7976073359067624,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 46.5688,
+ "eval_samples_per_second": 35.582,
+ "eval_steps_per_second": 4.467,
+ "step": 385
+ },
+ {
+ "entropy": 0.7035581320524216,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.45747122168540955,
+ "learning_rate": 0.00022177891520076015,
+ "loss": 0.6450813293457032,
+ "mean_token_accuracy": 0.8178210550546646,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6679977881908417,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.4456019401550293,
+ "learning_rate": 0.0002216012068086725,
+ "loss": 0.6189227294921875,
+ "mean_token_accuracy": 0.8257622331380844,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6574469250440598,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.3784359097480774,
+ "learning_rate": 0.00022119602267552194,
+ "loss": 0.6098381805419922,
+ "mean_token_accuracy": 0.8251238936185836,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6593573099374771,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.36508747935295105,
+ "learning_rate": 0.00022056419535323196,
+ "loss": 0.6137563705444335,
+ "mean_token_accuracy": 0.824503253698349,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6459340593218803,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.39773380756378174,
+ "learning_rate": 0.00021970702308872148,
+ "loss": 0.5986767196655274,
+ "mean_token_accuracy": 0.8270765203237533,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.633097175359726,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.41731834411621094,
+ "learning_rate": 0.00021862626715633265,
+ "loss": 0.5849922180175782,
+ "mean_token_accuracy": 0.8313153618574143,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6569280475378036,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.3067665100097656,
+ "learning_rate": 0.00021732414823885307,
+ "loss": 0.605062370300293,
+ "mean_token_accuracy": 0.8269566106796264,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6459099870920181,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5508913397789001,
+ "learning_rate": 0.00021580334186456886,
+ "loss": 0.596672134399414,
+ "mean_token_accuracy": 0.8287447422742844,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.6985598478752834,
+ "eval_loss": 0.6716415286064148,
+ "eval_mean_token_accuracy": 0.808078097895934,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 46.5197,
+ "eval_samples_per_second": 35.619,
+ "eval_steps_per_second": 4.471,
+ "step": 770
+ },
+ {
+ "entropy": 0.6113292586803436,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.37614893913269043,
+ "learning_rate": 0.00021406697290972404,
+ "loss": 0.5562425231933594,
+ "mean_token_accuracy": 0.8369060623645782,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.5720089420676231,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.35560500621795654,
+ "learning_rate": 0.00021211860917768236,
+ "loss": 0.521314697265625,
+ "mean_token_accuracy": 0.8453685063123703,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5757493850588798,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.304592490196228,
+ "learning_rate": 0.00020996225406798486,
+ "loss": 0.5258681106567383,
+ "mean_token_accuracy": 0.8431038129329681,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5790414094924927,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.29762616753578186,
+ "learning_rate": 0.00020760233835036664,
+ "loss": 0.5219763565063477,
+ "mean_token_accuracy": 0.8438457292318344,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.5783660891652107,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.3418065011501312,
+ "learning_rate": 0.00020504371106063417,
+ "loss": 0.5258687210083007,
+ "mean_token_accuracy": 0.8422278153896332,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5746926316618919,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.3708217740058899,
+ "learning_rate": 0.00020229162953711157,
+ "loss": 0.5161260223388672,
+ "mean_token_accuracy": 0.8453844922780991,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5676264691352845,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.26185691356658936,
+ "learning_rate": 0.00019935174861812654,
+ "loss": 0.5179851913452148,
+ "mean_token_accuracy": 0.8455151951313019,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5804974803328514,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.3131248652935028,
+ "learning_rate": 0.00019623010902273397,
+ "loss": 0.5243957138061524,
+ "mean_token_accuracy": 0.8455932134389877,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6151215266436338,
+ "eval_loss": 0.6557295918464661,
+ "eval_mean_token_accuracy": 0.8159411993737404,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 46.5152,
+ "eval_samples_per_second": 35.623,
+ "eval_steps_per_second": 4.472,
+ "step": 1155
+ },
+ {
+ "entropy": 0.5100037640333176,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.32568028569221497,
+ "learning_rate": 0.00019293312493855094,
+ "loss": 0.4522856140136719,
+ "mean_token_accuracy": 0.8600019490718842,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5058341425657272,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.3810749650001526,
+ "learning_rate": 0.00018946757084220762,
+ "loss": 0.44891536712646485,
+ "mean_token_accuracy": 0.8606845206022262,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5006153827905655,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.35473746061325073,
+ "learning_rate": 0.0001858405675794941,
+ "loss": 0.451065673828125,
+ "mean_token_accuracy": 0.8608392387628555,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.508549126982689,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.3414103388786316,
+ "learning_rate": 0.00018205956773380578,
+ "loss": 0.4535030746459961,
+ "mean_token_accuracy": 0.859148946404457,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.516265479028225,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.4597613513469696,
+ "learning_rate": 0.00017813234031295068,
+ "loss": 0.45882129669189453,
+ "mean_token_accuracy": 0.8582911169528962,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4902383416891098,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.3385583460330963,
+ "learning_rate": 0.0001740669547857841,
+ "loss": 0.4417523193359375,
+ "mean_token_accuracy": 0.8632881045341492,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5104234129190445,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.3935626149177551,
+ "learning_rate": 0.00016987176450147088,
+ "loss": 0.4547672653198242,
+ "mean_token_accuracy": 0.8592299193143844,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.565094108478381,
+ "eval_loss": 0.6673083305358887,
+ "eval_mean_token_accuracy": 0.8179781915476689,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 46.4987,
+ "eval_samples_per_second": 35.635,
+ "eval_steps_per_second": 4.473,
+ "step": 1540
+ },
+ {
+ "entropy": 0.49345644742250444,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.42027363181114197,
+ "learning_rate": 0.00016555538952544487,
+ "loss": 0.4355708312988281,
+ "mean_token_accuracy": 0.8648384511470795,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.4247111546993256,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.4478093087673187,
+ "learning_rate": 0.00016112669892733307,
+ "loss": 0.36362716674804685,
+ "mean_token_accuracy": 0.8844931781291961,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.42337420970201495,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.35812026262283325,
+ "learning_rate": 0.00015659479255723875,
+ "loss": 0.3651982498168945,
+ "mean_token_accuracy": 0.8830066406726838,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.43247521698474883,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.3665122985839844,
+ "learning_rate": 0.0001519689823478283,
+ "loss": 0.37368186950683596,
+ "mean_token_accuracy": 0.8800795775651932,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.4328634282946587,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.33961260318756104,
+ "learning_rate": 0.00014725877318064152,
+ "loss": 0.37599964141845704,
+ "mean_token_accuracy": 0.8797144430875778,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.42742862343788146,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.4312469959259033,
+ "learning_rate": 0.0001424738433559405,
+ "loss": 0.37354656219482424,
+ "mean_token_accuracy": 0.880826217532158,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4280877533555031,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.4343482255935669,
+ "learning_rate": 0.0001376240247062263,
+ "loss": 0.3688441467285156,
+ "mean_token_accuracy": 0.8814105206727981,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.42952129155397417,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.47131651639938354,
+ "learning_rate": 0.00013271928239428512,
+ "loss": 0.37270416259765626,
+ "mean_token_accuracy": 0.8788579875230789,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5064233084424183,
+ "eval_loss": 0.689122200012207,
+ "eval_mean_token_accuracy": 0.818150080453891,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 46.5114,
+ "eval_samples_per_second": 35.626,
+ "eval_steps_per_second": 4.472,
+ "step": 1925
+ },
+ {
+ "entropy": 0.3798492255806923,
+ "epoch": 5.064935064935065,
+ "grad_norm": 0.414468377828598,
+ "learning_rate": 0.0001277696944372747,
+ "loss": 0.31735713958740236,
+ "mean_token_accuracy": 0.8974772602319717,
+ "num_tokens": 4975955.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.3367840954661369,
+ "epoch": 5.194805194805195,
+ "grad_norm": 0.4515029191970825,
+ "learning_rate": 0.00012278543099892257,
+ "loss": 0.272756290435791,
+ "mean_token_accuracy": 0.9084931749105454,
+ "num_tokens": 5104770.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.34766108095645903,
+ "epoch": 5.324675324675325,
+ "grad_norm": 0.4040578007698059,
+ "learning_rate": 0.00011777673349238672,
+ "loss": 0.2792487144470215,
+ "mean_token_accuracy": 0.9065623581409454,
+ "num_tokens": 5229052.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.3566804251074791,
+ "epoch": 5.454545454545454,
+ "grad_norm": 0.5059600472450256,
+ "learning_rate": 0.00011275389353671628,
+ "loss": 0.2896596145629883,
+ "mean_token_accuracy": 0.9045185309648514,
+ "num_tokens": 5357753.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3462292793393135,
+ "epoch": 5.584415584415584,
+ "grad_norm": 0.4664144217967987,
+ "learning_rate": 0.00010772723181015153,
+ "loss": 0.27794593811035156,
+ "mean_token_accuracy": 0.9075321304798126,
+ "num_tokens": 5481550.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.3434346827864647,
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.46017780900001526,
+ "learning_rate": 0.00010270707684371499,
+ "loss": 0.2783885383605957,
+ "mean_token_accuracy": 0.9063384455442428,
+ "num_tokens": 5609104.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.341832632124424,
+ "epoch": 5.8441558441558445,
+ "grad_norm": 0.429608017206192,
+ "learning_rate": 9.77037437986665e-05,
+ "loss": 0.2815263748168945,
+ "mean_token_accuracy": 0.9060239523649216,
+ "num_tokens": 5737347.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.3492265248298645,
+ "epoch": 5.974025974025974,
+ "grad_norm": 0.5019266605377197,
+ "learning_rate": 9.272751327143021e-05,
+ "loss": 0.2844840621948242,
+ "mean_token_accuracy": 0.9042869365215301,
+ "num_tokens": 5861872.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.4365639974578069,
+ "eval_loss": 0.7624168395996094,
+ "eval_mean_token_accuracy": 0.8165808867376584,
+ "eval_num_tokens": 5889300.0,
+ "eval_runtime": 46.5221,
+ "eval_samples_per_second": 35.617,
+ "eval_steps_per_second": 4.471,
+ "step": 2310
+ },
+ {
+ "entropy": 0.28290177062153815,
+ "epoch": 6.103896103896104,
+ "grad_norm": 0.5457249283790588,
+ "learning_rate": 8.77886101695435e-05,
+ "loss": 0.2029383087158203,
+ "mean_token_accuracy": 0.9317537224292756,
+ "num_tokens": 5990679.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.2539422053098679,
+ "epoch": 6.233766233766234,
+ "grad_norm": 1.7734259366989136,
+ "learning_rate": 8.289718270203239e-05,
+ "loss": 0.1847425079345703,
+ "mean_token_accuracy": 0.937881036400795,
+ "num_tokens": 6117918.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.2555230759084225,
+ "epoch": 6.363636363636363,
+ "grad_norm": 0.49776697158813477,
+ "learning_rate": 7.806328152738371e-05,
+ "loss": 0.18783441543579102,
+ "mean_token_accuracy": 0.936203356385231,
+ "num_tokens": 6248566.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.2529813493788242,
+ "epoch": 6.4935064935064934,
+ "grad_norm": 0.4968299865722656,
+ "learning_rate": 7.32968391019587e-05,
+ "loss": 0.18458471298217774,
+ "mean_token_accuracy": 0.9365487760305404,
+ "num_tokens": 6380529.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.26623836129903794,
+ "epoch": 6.623376623376624,
+ "grad_norm": 0.6177894473075867,
+ "learning_rate": 6.860764927128271e-05,
+ "loss": 0.19328956604003905,
+ "mean_token_accuracy": 0.9330078029632568,
+ "num_tokens": 6501669.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.2671951600909233,
+ "epoch": 6.753246753246753,
+ "grad_norm": 0.6792670488357544,
+ "learning_rate": 6.400534714614501e-05,
+ "loss": 0.19405254364013672,
+ "mean_token_accuracy": 0.9335101181268692,
+ "num_tokens": 6624404.0,
+ "step": 2600
+ },
+ {
+ "entropy": 0.2522234851121902,
+ "epoch": 6.883116883116883,
+ "grad_norm": 0.4798950254917145,
+ "learning_rate": 5.949938930485951e-05,
+ "loss": 0.1846565818786621,
+ "mean_token_accuracy": 0.9369161009788514,
+ "num_tokens": 6755532.0,
+ "step": 2650
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.3764066012719503,
+ "eval_loss": 0.8554975390434265,
+ "eval_mean_token_accuracy": 0.8132733049301001,
+ "eval_num_tokens": 6870850.0,
+ "eval_runtime": 46.525,
+ "eval_samples_per_second": 35.615,
+ "eval_steps_per_second": 4.471,
+ "step": 2695
+ },
+ {
+ "entropy": 0.24576591402292253,
+ "epoch": 7.012987012987013,
+ "grad_norm": 0.3926205039024353,
+ "learning_rate": 5.5099034362364085e-05,
+ "loss": 0.1780208969116211,
+ "mean_token_accuracy": 0.9398036235570908,
+ "num_tokens": 6884338.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.18769787922501563,
+ "epoch": 7.142857142857143,
+ "grad_norm": 0.5062244534492493,
+ "learning_rate": 5.0813323946085895e-05,
+ "loss": 0.115099458694458,
+ "mean_token_accuracy": 0.961904166340828,
+ "num_tokens": 7008981.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.1834849800169468,
+ "epoch": 7.2727272727272725,
+ "grad_norm": 0.4319317638874054,
+ "learning_rate": 4.665106411766087e-05,
+ "loss": 0.11364558219909668,
+ "mean_token_accuracy": 0.9627783286571503,
+ "num_tokens": 7135402.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.17929002813994885,
+ "epoch": 7.402597402597403,
+ "grad_norm": 0.4122151732444763,
+ "learning_rate": 4.2620807278682855e-05,
+ "loss": 0.11115352630615234,
+ "mean_token_accuracy": 0.9625132656097413,
+ "num_tokens": 7265920.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.18764832600951195,
+ "epoch": 7.532467532467533,
+ "grad_norm": 0.4621254801750183,
+ "learning_rate": 3.873083459765971e-05,
+ "loss": 0.11578564643859864,
+ "mean_token_accuracy": 0.9611306923627854,
+ "num_tokens": 7389633.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.18220983803272248,
+ "epoch": 7.662337662337662,
+ "grad_norm": 0.45269420742988586,
+ "learning_rate": 3.498913899428605e-05,
+ "loss": 0.11399910926818847,
+ "mean_token_accuracy": 0.961864430308342,
+ "num_tokens": 7518696.0,
+ "step": 2950
+ },
+ {
+ "entropy": 0.18482646018266677,
+ "epoch": 7.792207792207792,
+ "grad_norm": 0.4839811325073242,
+ "learning_rate": 3.1403408715994884e-05,
+ "loss": 0.11555064201354981,
+ "mean_token_accuracy": 0.9611130750179291,
+ "num_tokens": 7643525.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.18274843357503415,
+ "epoch": 7.922077922077922,
+ "grad_norm": 0.46752479672431946,
+ "learning_rate": 2.798101154053465e-05,
+ "loss": 0.11180784225463868,
+ "mean_token_accuracy": 0.9621868497133255,
+ "num_tokens": 7773494.0,
+ "step": 3050
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.32566706384890354,
+ "eval_loss": 0.9837347865104675,
+ "eval_mean_token_accuracy": 0.8104732755858165,
+ "eval_num_tokens": 7852400.0,
+ "eval_runtime": 46.5488,
+ "eval_samples_per_second": 35.597,
+ "eval_steps_per_second": 4.468,
+ "step": 3080
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.3542904203667354e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.05284766149829996,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d3c2c962325ef43efd7ef06e9ad3a6df8d17f877
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3465/trainer_state.json
@@ -0,0 +1,823 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.0,
+ "eval_steps": 500,
+ "global_step": 3465,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8529596650600433,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.878183901309967,
+ "learning_rate": 2.8227544379109735e-05,
+ "loss": 1.762685546875,
+ "mean_token_accuracy": 0.6396001759171486,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.9309259909391403,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6279756426811218,
+ "learning_rate": 5.7031161092487016e-05,
+ "loss": 0.8922532653808594,
+ "mean_token_accuracy": 0.768746777176857,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8268384379148483,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.5478948354721069,
+ "learning_rate": 8.583477780586432e-05,
+ "loss": 0.7756452178955078,
+ "mean_token_accuracy": 0.7925205600261688,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7658095824718475,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.5972853899002075,
+ "learning_rate": 0.00011463839451924158,
+ "loss": 0.7161135101318359,
+ "mean_token_accuracy": 0.8019153982400894,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.769344270825386,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.5576531887054443,
+ "learning_rate": 0.0001434420112326189,
+ "loss": 0.716611557006836,
+ "mean_token_accuracy": 0.8028716742992401,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7470700722932816,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.3937978148460388,
+ "learning_rate": 0.0001722456279459962,
+ "loss": 0.6983388519287109,
+ "mean_token_accuracy": 0.80596988260746,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.7172233510017395,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.32564300298690796,
+ "learning_rate": 0.00020104924465937345,
+ "loss": 0.6731016540527344,
+ "mean_token_accuracy": 0.8119878542423248,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7397788639825124,
+ "eval_loss": 0.7304292917251587,
+ "eval_mean_token_accuracy": 0.7976073359067624,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 46.5688,
+ "eval_samples_per_second": 35.582,
+ "eval_steps_per_second": 4.467,
+ "step": 385
+ },
+ {
+ "entropy": 0.7035581320524216,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.45747122168540955,
+ "learning_rate": 0.00022177891520076015,
+ "loss": 0.6450813293457032,
+ "mean_token_accuracy": 0.8178210550546646,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6679977881908417,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.4456019401550293,
+ "learning_rate": 0.0002216012068086725,
+ "loss": 0.6189227294921875,
+ "mean_token_accuracy": 0.8257622331380844,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6574469250440598,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.3784359097480774,
+ "learning_rate": 0.00022119602267552194,
+ "loss": 0.6098381805419922,
+ "mean_token_accuracy": 0.8251238936185836,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6593573099374771,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.36508747935295105,
+ "learning_rate": 0.00022056419535323196,
+ "loss": 0.6137563705444335,
+ "mean_token_accuracy": 0.824503253698349,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6459340593218803,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.39773380756378174,
+ "learning_rate": 0.00021970702308872148,
+ "loss": 0.5986767196655274,
+ "mean_token_accuracy": 0.8270765203237533,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.633097175359726,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.41731834411621094,
+ "learning_rate": 0.00021862626715633265,
+ "loss": 0.5849922180175782,
+ "mean_token_accuracy": 0.8313153618574143,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6569280475378036,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.3067665100097656,
+ "learning_rate": 0.00021732414823885307,
+ "loss": 0.605062370300293,
+ "mean_token_accuracy": 0.8269566106796264,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6459099870920181,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5508913397789001,
+ "learning_rate": 0.00021580334186456886,
+ "loss": 0.596672134399414,
+ "mean_token_accuracy": 0.8287447422742844,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.6985598478752834,
+ "eval_loss": 0.6716415286064148,
+ "eval_mean_token_accuracy": 0.808078097895934,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 46.5197,
+ "eval_samples_per_second": 35.619,
+ "eval_steps_per_second": 4.471,
+ "step": 770
+ },
+ {
+ "entropy": 0.6113292586803436,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.37614893913269043,
+ "learning_rate": 0.00021406697290972404,
+ "loss": 0.5562425231933594,
+ "mean_token_accuracy": 0.8369060623645782,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.5720089420676231,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.35560500621795654,
+ "learning_rate": 0.00021211860917768236,
+ "loss": 0.521314697265625,
+ "mean_token_accuracy": 0.8453685063123703,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5757493850588798,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.304592490196228,
+ "learning_rate": 0.00020996225406798486,
+ "loss": 0.5258681106567383,
+ "mean_token_accuracy": 0.8431038129329681,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5790414094924927,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.29762616753578186,
+ "learning_rate": 0.00020760233835036664,
+ "loss": 0.5219763565063477,
+ "mean_token_accuracy": 0.8438457292318344,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.5783660891652107,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.3418065011501312,
+ "learning_rate": 0.00020504371106063417,
+ "loss": 0.5258687210083007,
+ "mean_token_accuracy": 0.8422278153896332,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5746926316618919,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.3708217740058899,
+ "learning_rate": 0.00020229162953711157,
+ "loss": 0.5161260223388672,
+ "mean_token_accuracy": 0.8453844922780991,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5676264691352845,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.26185691356658936,
+ "learning_rate": 0.00019935174861812654,
+ "loss": 0.5179851913452148,
+ "mean_token_accuracy": 0.8455151951313019,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5804974803328514,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.3131248652935028,
+ "learning_rate": 0.00019623010902273397,
+ "loss": 0.5243957138061524,
+ "mean_token_accuracy": 0.8455932134389877,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6151215266436338,
+ "eval_loss": 0.6557295918464661,
+ "eval_mean_token_accuracy": 0.8159411993737404,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 46.5152,
+ "eval_samples_per_second": 35.623,
+ "eval_steps_per_second": 4.472,
+ "step": 1155
+ },
+ {
+ "entropy": 0.5100037640333176,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.32568028569221497,
+ "learning_rate": 0.00019293312493855094,
+ "loss": 0.4522856140136719,
+ "mean_token_accuracy": 0.8600019490718842,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5058341425657272,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.3810749650001526,
+ "learning_rate": 0.00018946757084220762,
+ "loss": 0.44891536712646485,
+ "mean_token_accuracy": 0.8606845206022262,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5006153827905655,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.35473746061325073,
+ "learning_rate": 0.0001858405675794941,
+ "loss": 0.451065673828125,
+ "mean_token_accuracy": 0.8608392387628555,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.508549126982689,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.3414103388786316,
+ "learning_rate": 0.00018205956773380578,
+ "loss": 0.4535030746459961,
+ "mean_token_accuracy": 0.859148946404457,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.516265479028225,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.4597613513469696,
+ "learning_rate": 0.00017813234031295068,
+ "loss": 0.45882129669189453,
+ "mean_token_accuracy": 0.8582911169528962,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4902383416891098,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.3385583460330963,
+ "learning_rate": 0.0001740669547857841,
+ "loss": 0.4417523193359375,
+ "mean_token_accuracy": 0.8632881045341492,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5104234129190445,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.3935626149177551,
+ "learning_rate": 0.00016987176450147088,
+ "loss": 0.4547672653198242,
+ "mean_token_accuracy": 0.8592299193143844,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.565094108478381,
+ "eval_loss": 0.6673083305358887,
+ "eval_mean_token_accuracy": 0.8179781915476689,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 46.4987,
+ "eval_samples_per_second": 35.635,
+ "eval_steps_per_second": 4.473,
+ "step": 1540
+ },
+ {
+ "entropy": 0.49345644742250444,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.42027363181114197,
+ "learning_rate": 0.00016555538952544487,
+ "loss": 0.4355708312988281,
+ "mean_token_accuracy": 0.8648384511470795,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.4247111546993256,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.4478093087673187,
+ "learning_rate": 0.00016112669892733307,
+ "loss": 0.36362716674804685,
+ "mean_token_accuracy": 0.8844931781291961,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.42337420970201495,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.35812026262283325,
+ "learning_rate": 0.00015659479255723875,
+ "loss": 0.3651982498168945,
+ "mean_token_accuracy": 0.8830066406726838,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.43247521698474883,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.3665122985839844,
+ "learning_rate": 0.0001519689823478283,
+ "loss": 0.37368186950683596,
+ "mean_token_accuracy": 0.8800795775651932,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.4328634282946587,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.33961260318756104,
+ "learning_rate": 0.00014725877318064152,
+ "loss": 0.37599964141845704,
+ "mean_token_accuracy": 0.8797144430875778,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.42742862343788146,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.4312469959259033,
+ "learning_rate": 0.0001424738433559405,
+ "loss": 0.37354656219482424,
+ "mean_token_accuracy": 0.880826217532158,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4280877533555031,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.4343482255935669,
+ "learning_rate": 0.0001376240247062263,
+ "loss": 0.3688441467285156,
+ "mean_token_accuracy": 0.8814105206727981,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.42952129155397417,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.47131651639938354,
+ "learning_rate": 0.00013271928239428512,
+ "loss": 0.37270416259765626,
+ "mean_token_accuracy": 0.8788579875230789,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5064233084424183,
+ "eval_loss": 0.689122200012207,
+ "eval_mean_token_accuracy": 0.818150080453891,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 46.5114,
+ "eval_samples_per_second": 35.626,
+ "eval_steps_per_second": 4.472,
+ "step": 1925
+ },
+ {
+ "entropy": 0.3798492255806923,
+ "epoch": 5.064935064935065,
+ "grad_norm": 0.414468377828598,
+ "learning_rate": 0.0001277696944372747,
+ "loss": 0.31735713958740236,
+ "mean_token_accuracy": 0.8974772602319717,
+ "num_tokens": 4975955.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.3367840954661369,
+ "epoch": 5.194805194805195,
+ "grad_norm": 0.4515029191970825,
+ "learning_rate": 0.00012278543099892257,
+ "loss": 0.272756290435791,
+ "mean_token_accuracy": 0.9084931749105454,
+ "num_tokens": 5104770.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.34766108095645903,
+ "epoch": 5.324675324675325,
+ "grad_norm": 0.4040578007698059,
+ "learning_rate": 0.00011777673349238672,
+ "loss": 0.2792487144470215,
+ "mean_token_accuracy": 0.9065623581409454,
+ "num_tokens": 5229052.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.3566804251074791,
+ "epoch": 5.454545454545454,
+ "grad_norm": 0.5059600472450256,
+ "learning_rate": 0.00011275389353671628,
+ "loss": 0.2896596145629883,
+ "mean_token_accuracy": 0.9045185309648514,
+ "num_tokens": 5357753.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3462292793393135,
+ "epoch": 5.584415584415584,
+ "grad_norm": 0.4664144217967987,
+ "learning_rate": 0.00010772723181015153,
+ "loss": 0.27794593811035156,
+ "mean_token_accuracy": 0.9075321304798126,
+ "num_tokens": 5481550.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.3434346827864647,
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.46017780900001526,
+ "learning_rate": 0.00010270707684371499,
+ "loss": 0.2783885383605957,
+ "mean_token_accuracy": 0.9063384455442428,
+ "num_tokens": 5609104.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.341832632124424,
+ "epoch": 5.8441558441558445,
+ "grad_norm": 0.429608017206192,
+ "learning_rate": 9.77037437986665e-05,
+ "loss": 0.2815263748168945,
+ "mean_token_accuracy": 0.9060239523649216,
+ "num_tokens": 5737347.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.3492265248298645,
+ "epoch": 5.974025974025974,
+ "grad_norm": 0.5019266605377197,
+ "learning_rate": 9.272751327143021e-05,
+ "loss": 0.2844840621948242,
+ "mean_token_accuracy": 0.9042869365215301,
+ "num_tokens": 5861872.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.4365639974578069,
+ "eval_loss": 0.7624168395996094,
+ "eval_mean_token_accuracy": 0.8165808867376584,
+ "eval_num_tokens": 5889300.0,
+ "eval_runtime": 46.5221,
+ "eval_samples_per_second": 35.617,
+ "eval_steps_per_second": 4.471,
+ "step": 2310
+ },
+ {
+ "entropy": 0.28290177062153815,
+ "epoch": 6.103896103896104,
+ "grad_norm": 0.5457249283790588,
+ "learning_rate": 8.77886101695435e-05,
+ "loss": 0.2029383087158203,
+ "mean_token_accuracy": 0.9317537224292756,
+ "num_tokens": 5990679.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.2539422053098679,
+ "epoch": 6.233766233766234,
+ "grad_norm": 1.7734259366989136,
+ "learning_rate": 8.289718270203239e-05,
+ "loss": 0.1847425079345703,
+ "mean_token_accuracy": 0.937881036400795,
+ "num_tokens": 6117918.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.2555230759084225,
+ "epoch": 6.363636363636363,
+ "grad_norm": 0.49776697158813477,
+ "learning_rate": 7.806328152738371e-05,
+ "loss": 0.18783441543579102,
+ "mean_token_accuracy": 0.936203356385231,
+ "num_tokens": 6248566.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.2529813493788242,
+ "epoch": 6.4935064935064934,
+ "grad_norm": 0.4968299865722656,
+ "learning_rate": 7.32968391019587e-05,
+ "loss": 0.18458471298217774,
+ "mean_token_accuracy": 0.9365487760305404,
+ "num_tokens": 6380529.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.26623836129903794,
+ "epoch": 6.623376623376624,
+ "grad_norm": 0.6177894473075867,
+ "learning_rate": 6.860764927128271e-05,
+ "loss": 0.19328956604003905,
+ "mean_token_accuracy": 0.9330078029632568,
+ "num_tokens": 6501669.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.2671951600909233,
+ "epoch": 6.753246753246753,
+ "grad_norm": 0.6792670488357544,
+ "learning_rate": 6.400534714614501e-05,
+ "loss": 0.19405254364013672,
+ "mean_token_accuracy": 0.9335101181268692,
+ "num_tokens": 6624404.0,
+ "step": 2600
+ },
+ {
+ "entropy": 0.2522234851121902,
+ "epoch": 6.883116883116883,
+ "grad_norm": 0.4798950254917145,
+ "learning_rate": 5.949938930485951e-05,
+ "loss": 0.1846565818786621,
+ "mean_token_accuracy": 0.9369161009788514,
+ "num_tokens": 6755532.0,
+ "step": 2650
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.3764066012719503,
+ "eval_loss": 0.8554975390434265,
+ "eval_mean_token_accuracy": 0.8132733049301001,
+ "eval_num_tokens": 6870850.0,
+ "eval_runtime": 46.525,
+ "eval_samples_per_second": 35.615,
+ "eval_steps_per_second": 4.471,
+ "step": 2695
+ },
+ {
+ "entropy": 0.24576591402292253,
+ "epoch": 7.012987012987013,
+ "grad_norm": 0.3926205039024353,
+ "learning_rate": 5.5099034362364085e-05,
+ "loss": 0.1780208969116211,
+ "mean_token_accuracy": 0.9398036235570908,
+ "num_tokens": 6884338.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.18769787922501563,
+ "epoch": 7.142857142857143,
+ "grad_norm": 0.5062244534492493,
+ "learning_rate": 5.0813323946085895e-05,
+ "loss": 0.115099458694458,
+ "mean_token_accuracy": 0.961904166340828,
+ "num_tokens": 7008981.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.1834849800169468,
+ "epoch": 7.2727272727272725,
+ "grad_norm": 0.4319317638874054,
+ "learning_rate": 4.665106411766087e-05,
+ "loss": 0.11364558219909668,
+ "mean_token_accuracy": 0.9627783286571503,
+ "num_tokens": 7135402.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.17929002813994885,
+ "epoch": 7.402597402597403,
+ "grad_norm": 0.4122151732444763,
+ "learning_rate": 4.2620807278682855e-05,
+ "loss": 0.11115352630615234,
+ "mean_token_accuracy": 0.9625132656097413,
+ "num_tokens": 7265920.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.18764832600951195,
+ "epoch": 7.532467532467533,
+ "grad_norm": 0.4621254801750183,
+ "learning_rate": 3.873083459765971e-05,
+ "loss": 0.11578564643859864,
+ "mean_token_accuracy": 0.9611306923627854,
+ "num_tokens": 7389633.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.18220983803272248,
+ "epoch": 7.662337662337662,
+ "grad_norm": 0.45269420742988586,
+ "learning_rate": 3.498913899428605e-05,
+ "loss": 0.11399910926818847,
+ "mean_token_accuracy": 0.961864430308342,
+ "num_tokens": 7518696.0,
+ "step": 2950
+ },
+ {
+ "entropy": 0.18482646018266677,
+ "epoch": 7.792207792207792,
+ "grad_norm": 0.4839811325073242,
+ "learning_rate": 3.1403408715994884e-05,
+ "loss": 0.11555064201354981,
+ "mean_token_accuracy": 0.9611130750179291,
+ "num_tokens": 7643525.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.18274843357503415,
+ "epoch": 7.922077922077922,
+ "grad_norm": 0.46752479672431946,
+ "learning_rate": 2.798101154053465e-05,
+ "loss": 0.11180784225463868,
+ "mean_token_accuracy": 0.9621868497133255,
+ "num_tokens": 7773494.0,
+ "step": 3050
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.32566706384890354,
+ "eval_loss": 0.9837347865104675,
+ "eval_mean_token_accuracy": 0.8104732755858165,
+ "eval_num_tokens": 7852400.0,
+ "eval_runtime": 46.5488,
+ "eval_samples_per_second": 35.597,
+ "eval_steps_per_second": 4.468,
+ "step": 3080
+ },
+ {
+ "entropy": 0.1633647498488426,
+ "epoch": 8.051948051948052,
+ "grad_norm": 0.4305579960346222,
+ "learning_rate": 2.472897963703081e-05,
+ "loss": 0.09586874961853027,
+ "mean_token_accuracy": 0.9682459622621536,
+ "num_tokens": 7903348.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.14545966424047946,
+ "epoch": 8.181818181818182,
+ "grad_norm": 0.3315879702568054,
+ "learning_rate": 2.1653995116639546e-05,
+ "loss": 0.07627681255340577,
+ "mean_token_accuracy": 0.9752496027946472,
+ "num_tokens": 8028778.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.14618608497083188,
+ "epoch": 8.311688311688311,
+ "grad_norm": 0.30432993173599243,
+ "learning_rate": 1.876237630248263e-05,
+ "loss": 0.07688333988189697,
+ "mean_token_accuracy": 0.9748979198932648,
+ "num_tokens": 8153699.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.14237778432667256,
+ "epoch": 8.441558441558442,
+ "grad_norm": 0.299809068441391,
+ "learning_rate": 1.606006474707584e-05,
+ "loss": 0.07612751007080078,
+ "mean_token_accuracy": 0.9753290069103241,
+ "num_tokens": 8281408.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.14949887059628963,
+ "epoch": 8.571428571428571,
+ "grad_norm": 0.4242253601551056,
+ "learning_rate": 1.355261302392631e-05,
+ "loss": 0.08111579895019531,
+ "mean_token_accuracy": 0.9737774491310119,
+ "num_tokens": 8400657.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.1369019091874361,
+ "epoch": 8.7012987012987,
+ "grad_norm": 0.2756560742855072,
+ "learning_rate": 1.1245173318384599e-05,
+ "loss": 0.07382246494293213,
+ "mean_token_accuracy": 0.9767089641094208,
+ "num_tokens": 8533080.0,
+ "step": 3350
+ },
+ {
+ "entropy": 0.13799229875206948,
+ "epoch": 8.831168831168831,
+ "grad_norm": 0.32391059398651123,
+ "learning_rate": 9.14248684119404e-06,
+ "loss": 0.07332521915435791,
+ "mean_token_accuracy": 0.975739398598671,
+ "num_tokens": 8664922.0,
+ "step": 3400
+ },
+ {
+ "entropy": 0.13693079218268395,
+ "epoch": 8.96103896103896,
+ "grad_norm": 0.3435397446155548,
+ "learning_rate": 7.248874086490063e-06,
+ "loss": 0.0737720012664795,
+ "mean_token_accuracy": 0.9756388676166534,
+ "num_tokens": 8797714.0,
+ "step": 3450
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.2884131854113478,
+ "eval_loss": 1.1368393898010254,
+ "eval_mean_token_accuracy": 0.8096811464772775,
+ "eval_num_tokens": 8833950.0,
+ "eval_runtime": 46.5492,
+ "eval_samples_per_second": 35.597,
+ "eval_steps_per_second": 4.468,
+ "step": 3465
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.8981730000368845e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.05284766149829996,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Renders a message list as ChatML turns (<|im_start|>role ... <|im_end|>): optional tool signatures inside <tools>, assistant reasoning inside <think>, tool calls inside <tool_call>, tool results inside <tool_response>. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Walk the messages backwards to record the index of the last user turn that is not a wrapped tool response. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {#- No separate reasoning field: split inline reasoning from the answer at the closing think marker. -#}{%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3689945836de870b77e8a2338a315e50eafcb091
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-385/trainer_state.json
@@ -0,0 +1,115 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 500,
+ "global_step": 385,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8529596650600433,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.878183901309967,
+ "learning_rate": 2.8227544379109735e-05,
+ "loss": 1.762685546875,
+ "mean_token_accuracy": 0.6396001759171486,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.9309259909391403,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6279756426811218,
+ "learning_rate": 5.7031161092487016e-05,
+ "loss": 0.8922532653808594,
+ "mean_token_accuracy": 0.768746777176857,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8268384379148483,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.5478948354721069,
+ "learning_rate": 8.583477780586432e-05,
+ "loss": 0.7756452178955078,
+ "mean_token_accuracy": 0.7925205600261688,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7658095824718475,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.5972853899002075,
+ "learning_rate": 0.00011463839451924158,
+ "loss": 0.7161135101318359,
+ "mean_token_accuracy": 0.8019153982400894,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.769344270825386,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.5576531887054443,
+ "learning_rate": 0.0001434420112326189,
+ "loss": 0.716611557006836,
+ "mean_token_accuracy": 0.8028716742992401,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7470700722932816,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.3937978148460388,
+ "learning_rate": 0.0001722456279459962,
+ "loss": 0.6983388519287109,
+ "mean_token_accuracy": 0.80596988260746,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.7172233510017395,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.32564300298690796,
+ "learning_rate": 0.00020104924465937345,
+ "loss": 0.6731016540527344,
+ "mean_token_accuracy": 0.8119878542423248,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7397788639825124,
+ "eval_loss": 0.7304292917251587,
+ "eval_mean_token_accuracy": 0.7976073359067624,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 46.5688,
+ "eval_samples_per_second": 35.582,
+ "eval_steps_per_second": 4.467,
+ "step": 385
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.429451631126118e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.05284766149829996,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Chat prompt template: optional system/tools preamble, then every message wrapped in <|im_start|>role ... <|im_end|>, then an optional generation prompt. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Walk the messages backwards to find the index of the last user message that is not a wrapped tool response; defaults to the final index when none is found. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {#- No explicit reasoning_content field: split inline <think>...</think> reasoning out of the message content. -#}{%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {#- The think block is only rendered for assistant turns that come after the last real user query. -#}{%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {#- Accept both the nested {function: {...}} shape and a flat tool-call object. -#}{%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {#- Consecutive tool messages are merged into one user turn, each wrapped in <tool_response> tags. -#}{%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {#- Emit an empty think block when thinking is explicitly disabled. -#}{%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b04683da5d407a815bb860c3c5a1b481e272edd7
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3850/trainer_state.json
@@ -0,0 +1,914 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 10.0,
+ "eval_steps": 500,
+ "global_step": 3850,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8529596650600433,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.878183901309967,
+ "learning_rate": 2.8227544379109735e-05,
+ "loss": 1.762685546875,
+ "mean_token_accuracy": 0.6396001759171486,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.9309259909391403,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6279756426811218,
+ "learning_rate": 5.7031161092487016e-05,
+ "loss": 0.8922532653808594,
+ "mean_token_accuracy": 0.768746777176857,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8268384379148483,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.5478948354721069,
+ "learning_rate": 8.583477780586432e-05,
+ "loss": 0.7756452178955078,
+ "mean_token_accuracy": 0.7925205600261688,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7658095824718475,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.5972853899002075,
+ "learning_rate": 0.00011463839451924158,
+ "loss": 0.7161135101318359,
+ "mean_token_accuracy": 0.8019153982400894,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.769344270825386,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.5576531887054443,
+ "learning_rate": 0.0001434420112326189,
+ "loss": 0.716611557006836,
+ "mean_token_accuracy": 0.8028716742992401,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7470700722932816,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.3937978148460388,
+ "learning_rate": 0.0001722456279459962,
+ "loss": 0.6983388519287109,
+ "mean_token_accuracy": 0.80596988260746,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.7172233510017395,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.32564300298690796,
+ "learning_rate": 0.00020104924465937345,
+ "loss": 0.6731016540527344,
+ "mean_token_accuracy": 0.8119878542423248,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7397788639825124,
+ "eval_loss": 0.7304292917251587,
+ "eval_mean_token_accuracy": 0.7976073359067624,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 46.5688,
+ "eval_samples_per_second": 35.582,
+ "eval_steps_per_second": 4.467,
+ "step": 385
+ },
+ {
+ "entropy": 0.7035581320524216,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.45747122168540955,
+ "learning_rate": 0.00022177891520076015,
+ "loss": 0.6450813293457032,
+ "mean_token_accuracy": 0.8178210550546646,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6679977881908417,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.4456019401550293,
+ "learning_rate": 0.0002216012068086725,
+ "loss": 0.6189227294921875,
+ "mean_token_accuracy": 0.8257622331380844,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6574469250440598,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.3784359097480774,
+ "learning_rate": 0.00022119602267552194,
+ "loss": 0.6098381805419922,
+ "mean_token_accuracy": 0.8251238936185836,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6593573099374771,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.36508747935295105,
+ "learning_rate": 0.00022056419535323196,
+ "loss": 0.6137563705444335,
+ "mean_token_accuracy": 0.824503253698349,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6459340593218803,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.39773380756378174,
+ "learning_rate": 0.00021970702308872148,
+ "loss": 0.5986767196655274,
+ "mean_token_accuracy": 0.8270765203237533,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.633097175359726,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.41731834411621094,
+ "learning_rate": 0.00021862626715633265,
+ "loss": 0.5849922180175782,
+ "mean_token_accuracy": 0.8313153618574143,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6569280475378036,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.3067665100097656,
+ "learning_rate": 0.00021732414823885307,
+ "loss": 0.605062370300293,
+ "mean_token_accuracy": 0.8269566106796264,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6459099870920181,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5508913397789001,
+ "learning_rate": 0.00021580334186456886,
+ "loss": 0.596672134399414,
+ "mean_token_accuracy": 0.8287447422742844,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.6985598478752834,
+ "eval_loss": 0.6716415286064148,
+ "eval_mean_token_accuracy": 0.808078097895934,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 46.5197,
+ "eval_samples_per_second": 35.619,
+ "eval_steps_per_second": 4.471,
+ "step": 770
+ },
+ {
+ "entropy": 0.6113292586803436,
+ "epoch": 2.0779220779220777,
+ "grad_norm": 0.37614893913269043,
+ "learning_rate": 0.00021406697290972404,
+ "loss": 0.5562425231933594,
+ "mean_token_accuracy": 0.8369060623645782,
+ "num_tokens": 2035968.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.5720089420676231,
+ "epoch": 2.207792207792208,
+ "grad_norm": 0.35560500621795654,
+ "learning_rate": 0.00021211860917768236,
+ "loss": 0.521314697265625,
+ "mean_token_accuracy": 0.8453685063123703,
+ "num_tokens": 2168591.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.5757493850588798,
+ "epoch": 2.3376623376623376,
+ "grad_norm": 0.304592490196228,
+ "learning_rate": 0.00020996225406798486,
+ "loss": 0.5258681106567383,
+ "mean_token_accuracy": 0.8431038129329681,
+ "num_tokens": 2297967.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.5790414094924927,
+ "epoch": 2.4675324675324677,
+ "grad_norm": 0.29762616753578186,
+ "learning_rate": 0.00020760233835036664,
+ "loss": 0.5219763565063477,
+ "mean_token_accuracy": 0.8438457292318344,
+ "num_tokens": 2425614.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.5783660891652107,
+ "epoch": 2.5974025974025974,
+ "grad_norm": 0.3418065011501312,
+ "learning_rate": 0.00020504371106063417,
+ "loss": 0.5258687210083007,
+ "mean_token_accuracy": 0.8422278153896332,
+ "num_tokens": 2557421.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5746926316618919,
+ "epoch": 2.7272727272727275,
+ "grad_norm": 0.3708217740058899,
+ "learning_rate": 0.00020229162953711157,
+ "loss": 0.5161260223388672,
+ "mean_token_accuracy": 0.8453844922780991,
+ "num_tokens": 2680379.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.5676264691352845,
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.26185691356658936,
+ "learning_rate": 0.00019935174861812654,
+ "loss": 0.5179851913452148,
+ "mean_token_accuracy": 0.8455151951313019,
+ "num_tokens": 2810854.0,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5804974803328514,
+ "epoch": 2.987012987012987,
+ "grad_norm": 0.3131248652935028,
+ "learning_rate": 0.00019623010902273397,
+ "loss": 0.5243957138061524,
+ "mean_token_accuracy": 0.8455932134389877,
+ "num_tokens": 2932637.0,
+ "step": 1150
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.6151215266436338,
+ "eval_loss": 0.6557295918464661,
+ "eval_mean_token_accuracy": 0.8159411993737404,
+ "eval_num_tokens": 2944650.0,
+ "eval_runtime": 46.5152,
+ "eval_samples_per_second": 35.623,
+ "eval_steps_per_second": 4.472,
+ "step": 1155
+ },
+ {
+ "entropy": 0.5100037640333176,
+ "epoch": 3.116883116883117,
+ "grad_norm": 0.32568028569221497,
+ "learning_rate": 0.00019293312493855094,
+ "loss": 0.4522856140136719,
+ "mean_token_accuracy": 0.8600019490718842,
+ "num_tokens": 3054046.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5058341425657272,
+ "epoch": 3.2467532467532467,
+ "grad_norm": 0.3810749650001526,
+ "learning_rate": 0.00018946757084220762,
+ "loss": 0.44891536712646485,
+ "mean_token_accuracy": 0.8606845206022262,
+ "num_tokens": 3182515.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.5006153827905655,
+ "epoch": 3.3766233766233764,
+ "grad_norm": 0.35473746061325073,
+ "learning_rate": 0.0001858405675794941,
+ "loss": 0.451065673828125,
+ "mean_token_accuracy": 0.8608392387628555,
+ "num_tokens": 3313672.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.508549126982689,
+ "epoch": 3.5064935064935066,
+ "grad_norm": 0.3414103388786316,
+ "learning_rate": 0.00018205956773380578,
+ "loss": 0.4535030746459961,
+ "mean_token_accuracy": 0.859148946404457,
+ "num_tokens": 3440689.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.516265479028225,
+ "epoch": 3.6363636363636362,
+ "grad_norm": 0.4597613513469696,
+ "learning_rate": 0.00017813234031295068,
+ "loss": 0.45882129669189453,
+ "mean_token_accuracy": 0.8582911169528962,
+ "num_tokens": 3566427.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4902383416891098,
+ "epoch": 3.7662337662337664,
+ "grad_norm": 0.3385583460330963,
+ "learning_rate": 0.0001740669547857841,
+ "loss": 0.4417523193359375,
+ "mean_token_accuracy": 0.8632881045341492,
+ "num_tokens": 3699976.0,
+ "step": 1450
+ },
+ {
+ "entropy": 0.5104234129190445,
+ "epoch": 3.896103896103896,
+ "grad_norm": 0.3935626149177551,
+ "learning_rate": 0.00016987176450147088,
+ "loss": 0.4547672653198242,
+ "mean_token_accuracy": 0.8592299193143844,
+ "num_tokens": 3827926.0,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.565094108478381,
+ "eval_loss": 0.6673083305358887,
+ "eval_mean_token_accuracy": 0.8179781915476689,
+ "eval_num_tokens": 3926200.0,
+ "eval_runtime": 46.4987,
+ "eval_samples_per_second": 35.635,
+ "eval_steps_per_second": 4.473,
+ "step": 1540
+ },
+ {
+ "entropy": 0.49345644742250444,
+ "epoch": 4.025974025974026,
+ "grad_norm": 0.42027363181114197,
+ "learning_rate": 0.00016555538952544487,
+ "loss": 0.4355708312988281,
+ "mean_token_accuracy": 0.8648384511470795,
+ "num_tokens": 3953109.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.4247111546993256,
+ "epoch": 4.1558441558441555,
+ "grad_norm": 0.4478093087673187,
+ "learning_rate": 0.00016112669892733307,
+ "loss": 0.36362716674804685,
+ "mean_token_accuracy": 0.8844931781291961,
+ "num_tokens": 4081079.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.42337420970201495,
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.35812026262283325,
+ "learning_rate": 0.00015659479255723875,
+ "loss": 0.3651982498168945,
+ "mean_token_accuracy": 0.8830066406726838,
+ "num_tokens": 4206426.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.43247521698474883,
+ "epoch": 4.415584415584416,
+ "grad_norm": 0.3665122985839844,
+ "learning_rate": 0.0001519689823478283,
+ "loss": 0.37368186950683596,
+ "mean_token_accuracy": 0.8800795775651932,
+ "num_tokens": 4329839.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.4328634282946587,
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.33961260318756104,
+ "learning_rate": 0.00014725877318064152,
+ "loss": 0.37599964141845704,
+ "mean_token_accuracy": 0.8797144430875778,
+ "num_tokens": 4453940.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.42742862343788146,
+ "epoch": 4.675324675324675,
+ "grad_norm": 0.4312469959259033,
+ "learning_rate": 0.0001424738433559405,
+ "loss": 0.37354656219482424,
+ "mean_token_accuracy": 0.880826217532158,
+ "num_tokens": 4583626.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.4280877533555031,
+ "epoch": 4.805194805194805,
+ "grad_norm": 0.4343482255935669,
+ "learning_rate": 0.0001376240247062263,
+ "loss": 0.3688441467285156,
+ "mean_token_accuracy": 0.8814105206727981,
+ "num_tokens": 4713540.0,
+ "step": 1850
+ },
+ {
+ "entropy": 0.42952129155397417,
+ "epoch": 4.935064935064935,
+ "grad_norm": 0.47131651639938354,
+ "learning_rate": 0.00013271928239428512,
+ "loss": 0.37270416259765626,
+ "mean_token_accuracy": 0.8788579875230789,
+ "num_tokens": 4845678.0,
+ "step": 1900
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.5064233084424183,
+ "eval_loss": 0.689122200012207,
+ "eval_mean_token_accuracy": 0.818150080453891,
+ "eval_num_tokens": 4907750.0,
+ "eval_runtime": 46.5114,
+ "eval_samples_per_second": 35.626,
+ "eval_steps_per_second": 4.472,
+ "step": 1925
+ },
+ {
+ "entropy": 0.3798492255806923,
+ "epoch": 5.064935064935065,
+ "grad_norm": 0.414468377828598,
+ "learning_rate": 0.0001277696944372747,
+ "loss": 0.31735713958740236,
+ "mean_token_accuracy": 0.8974772602319717,
+ "num_tokens": 4975955.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.3367840954661369,
+ "epoch": 5.194805194805195,
+ "grad_norm": 0.4515029191970825,
+ "learning_rate": 0.00012278543099892257,
+ "loss": 0.272756290435791,
+ "mean_token_accuracy": 0.9084931749105454,
+ "num_tokens": 5104770.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.34766108095645903,
+ "epoch": 5.324675324675325,
+ "grad_norm": 0.4040578007698059,
+ "learning_rate": 0.00011777673349238672,
+ "loss": 0.2792487144470215,
+ "mean_token_accuracy": 0.9065623581409454,
+ "num_tokens": 5229052.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.3566804251074791,
+ "epoch": 5.454545454545454,
+ "grad_norm": 0.5059600472450256,
+ "learning_rate": 0.00011275389353671628,
+ "loss": 0.2896596145629883,
+ "mean_token_accuracy": 0.9045185309648514,
+ "num_tokens": 5357753.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3462292793393135,
+ "epoch": 5.584415584415584,
+ "grad_norm": 0.4664144217967987,
+ "learning_rate": 0.00010772723181015153,
+ "loss": 0.27794593811035156,
+ "mean_token_accuracy": 0.9075321304798126,
+ "num_tokens": 5481550.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.3434346827864647,
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.46017780900001526,
+ "learning_rate": 0.00010270707684371499,
+ "loss": 0.2783885383605957,
+ "mean_token_accuracy": 0.9063384455442428,
+ "num_tokens": 5609104.0,
+ "step": 2200
+ },
+ {
+ "entropy": 0.341832632124424,
+ "epoch": 5.8441558441558445,
+ "grad_norm": 0.429608017206192,
+ "learning_rate": 9.77037437986665e-05,
+ "loss": 0.2815263748168945,
+ "mean_token_accuracy": 0.9060239523649216,
+ "num_tokens": 5737347.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.3492265248298645,
+ "epoch": 5.974025974025974,
+ "grad_norm": 0.5019266605377197,
+ "learning_rate": 9.272751327143021e-05,
+ "loss": 0.2844840621948242,
+ "mean_token_accuracy": 0.9042869365215301,
+ "num_tokens": 5861872.0,
+ "step": 2300
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.4365639974578069,
+ "eval_loss": 0.7624168395996094,
+ "eval_mean_token_accuracy": 0.8165808867376584,
+ "eval_num_tokens": 5889300.0,
+ "eval_runtime": 46.5221,
+ "eval_samples_per_second": 35.617,
+ "eval_steps_per_second": 4.471,
+ "step": 2310
+ },
+ {
+ "entropy": 0.28290177062153815,
+ "epoch": 6.103896103896104,
+ "grad_norm": 0.5457249283790588,
+ "learning_rate": 8.77886101695435e-05,
+ "loss": 0.2029383087158203,
+ "mean_token_accuracy": 0.9317537224292756,
+ "num_tokens": 5990679.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.2539422053098679,
+ "epoch": 6.233766233766234,
+ "grad_norm": 1.7734259366989136,
+ "learning_rate": 8.289718270203239e-05,
+ "loss": 0.1847425079345703,
+ "mean_token_accuracy": 0.937881036400795,
+ "num_tokens": 6117918.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.2555230759084225,
+ "epoch": 6.363636363636363,
+ "grad_norm": 0.49776697158813477,
+ "learning_rate": 7.806328152738371e-05,
+ "loss": 0.18783441543579102,
+ "mean_token_accuracy": 0.936203356385231,
+ "num_tokens": 6248566.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.2529813493788242,
+ "epoch": 6.4935064935064934,
+ "grad_norm": 0.4968299865722656,
+ "learning_rate": 7.32968391019587e-05,
+ "loss": 0.18458471298217774,
+ "mean_token_accuracy": 0.9365487760305404,
+ "num_tokens": 6380529.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.26623836129903794,
+ "epoch": 6.623376623376624,
+ "grad_norm": 0.6177894473075867,
+ "learning_rate": 6.860764927128271e-05,
+ "loss": 0.19328956604003905,
+ "mean_token_accuracy": 0.9330078029632568,
+ "num_tokens": 6501669.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.2671951600909233,
+ "epoch": 6.753246753246753,
+ "grad_norm": 0.6792670488357544,
+ "learning_rate": 6.400534714614501e-05,
+ "loss": 0.19405254364013672,
+ "mean_token_accuracy": 0.9335101181268692,
+ "num_tokens": 6624404.0,
+ "step": 2600
+ },
+ {
+ "entropy": 0.2522234851121902,
+ "epoch": 6.883116883116883,
+ "grad_norm": 0.4798950254917145,
+ "learning_rate": 5.949938930485951e-05,
+ "loss": 0.1846565818786621,
+ "mean_token_accuracy": 0.9369161009788514,
+ "num_tokens": 6755532.0,
+ "step": 2650
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.3764066012719503,
+ "eval_loss": 0.8554975390434265,
+ "eval_mean_token_accuracy": 0.8132733049301001,
+ "eval_num_tokens": 6870850.0,
+ "eval_runtime": 46.525,
+ "eval_samples_per_second": 35.615,
+ "eval_steps_per_second": 4.471,
+ "step": 2695
+ },
+ {
+ "entropy": 0.24576591402292253,
+ "epoch": 7.012987012987013,
+ "grad_norm": 0.3926205039024353,
+ "learning_rate": 5.5099034362364085e-05,
+ "loss": 0.1780208969116211,
+ "mean_token_accuracy": 0.9398036235570908,
+ "num_tokens": 6884338.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.18769787922501563,
+ "epoch": 7.142857142857143,
+ "grad_norm": 0.5062244534492493,
+ "learning_rate": 5.0813323946085895e-05,
+ "loss": 0.115099458694458,
+ "mean_token_accuracy": 0.961904166340828,
+ "num_tokens": 7008981.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.1834849800169468,
+ "epoch": 7.2727272727272725,
+ "grad_norm": 0.4319317638874054,
+ "learning_rate": 4.665106411766087e-05,
+ "loss": 0.11364558219909668,
+ "mean_token_accuracy": 0.9627783286571503,
+ "num_tokens": 7135402.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.17929002813994885,
+ "epoch": 7.402597402597403,
+ "grad_norm": 0.4122151732444763,
+ "learning_rate": 4.2620807278682855e-05,
+ "loss": 0.11115352630615234,
+ "mean_token_accuracy": 0.9625132656097413,
+ "num_tokens": 7265920.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.18764832600951195,
+ "epoch": 7.532467532467533,
+ "grad_norm": 0.4621254801750183,
+ "learning_rate": 3.873083459765971e-05,
+ "loss": 0.11578564643859864,
+ "mean_token_accuracy": 0.9611306923627854,
+ "num_tokens": 7389633.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.18220983803272248,
+ "epoch": 7.662337662337662,
+ "grad_norm": 0.45269420742988586,
+ "learning_rate": 3.498913899428605e-05,
+ "loss": 0.11399910926818847,
+ "mean_token_accuracy": 0.961864430308342,
+ "num_tokens": 7518696.0,
+ "step": 2950
+ },
+ {
+ "entropy": 0.18482646018266677,
+ "epoch": 7.792207792207792,
+ "grad_norm": 0.4839811325073242,
+ "learning_rate": 3.1403408715994884e-05,
+ "loss": 0.11555064201354981,
+ "mean_token_accuracy": 0.9611130750179291,
+ "num_tokens": 7643525.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.18274843357503415,
+ "epoch": 7.922077922077922,
+ "grad_norm": 0.46752479672431946,
+ "learning_rate": 2.798101154053465e-05,
+ "loss": 0.11180784225463868,
+ "mean_token_accuracy": 0.9621868497133255,
+ "num_tokens": 7773494.0,
+ "step": 3050
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.32566706384890354,
+ "eval_loss": 0.9837347865104675,
+ "eval_mean_token_accuracy": 0.8104732755858165,
+ "eval_num_tokens": 7852400.0,
+ "eval_runtime": 46.5488,
+ "eval_samples_per_second": 35.597,
+ "eval_steps_per_second": 4.468,
+ "step": 3080
+ },
+ {
+ "entropy": 0.1633647498488426,
+ "epoch": 8.051948051948052,
+ "grad_norm": 0.4305579960346222,
+ "learning_rate": 2.472897963703081e-05,
+ "loss": 0.09586874961853027,
+ "mean_token_accuracy": 0.9682459622621536,
+ "num_tokens": 7903348.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.14545966424047946,
+ "epoch": 8.181818181818182,
+ "grad_norm": 0.3315879702568054,
+ "learning_rate": 2.1653995116639546e-05,
+ "loss": 0.07627681255340577,
+ "mean_token_accuracy": 0.9752496027946472,
+ "num_tokens": 8028778.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.14618608497083188,
+ "epoch": 8.311688311688311,
+ "grad_norm": 0.30432993173599243,
+ "learning_rate": 1.876237630248263e-05,
+ "loss": 0.07688333988189697,
+ "mean_token_accuracy": 0.9748979198932648,
+ "num_tokens": 8153699.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.14237778432667256,
+ "epoch": 8.441558441558442,
+ "grad_norm": 0.299809068441391,
+ "learning_rate": 1.606006474707584e-05,
+ "loss": 0.07612751007080078,
+ "mean_token_accuracy": 0.9753290069103241,
+ "num_tokens": 8281408.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.14949887059628963,
+ "epoch": 8.571428571428571,
+ "grad_norm": 0.4242253601551056,
+ "learning_rate": 1.355261302392631e-05,
+ "loss": 0.08111579895019531,
+ "mean_token_accuracy": 0.9737774491310119,
+ "num_tokens": 8400657.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.1369019091874361,
+ "epoch": 8.7012987012987,
+ "grad_norm": 0.2756560742855072,
+ "learning_rate": 1.1245173318384599e-05,
+ "loss": 0.07382246494293213,
+ "mean_token_accuracy": 0.9767089641094208,
+ "num_tokens": 8533080.0,
+ "step": 3350
+ },
+ {
+ "entropy": 0.13799229875206948,
+ "epoch": 8.831168831168831,
+ "grad_norm": 0.32391059398651123,
+ "learning_rate": 9.14248684119404e-06,
+ "loss": 0.07332521915435791,
+ "mean_token_accuracy": 0.975739398598671,
+ "num_tokens": 8664922.0,
+ "step": 3400
+ },
+ {
+ "entropy": 0.13693079218268395,
+ "epoch": 8.96103896103896,
+ "grad_norm": 0.3435397446155548,
+ "learning_rate": 7.248874086490063e-06,
+ "loss": 0.0737720012664795,
+ "mean_token_accuracy": 0.9756388676166534,
+ "num_tokens": 8797714.0,
+ "step": 3450
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.2884131854113478,
+ "eval_loss": 1.1368393898010254,
+ "eval_mean_token_accuracy": 0.8096811464772775,
+ "eval_num_tokens": 8833950.0,
+ "eval_runtime": 46.5492,
+ "eval_samples_per_second": 35.597,
+ "eval_steps_per_second": 4.468,
+ "step": 3465
+ },
+ {
+ "entropy": 0.13195306338369847,
+ "epoch": 9.090909090909092,
+ "grad_norm": 0.3066045641899109,
+ "learning_rate": 5.568225954266577e-06,
+ "loss": 0.06769096851348877,
+ "mean_token_accuracy": 0.9786338102817536,
+ "num_tokens": 8925069.0,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1242513469606638,
+ "epoch": 9.220779220779221,
+ "grad_norm": 0.21904544532299042,
+ "learning_rate": 4.103995755551041e-06,
+ "loss": 0.061446948051452635,
+ "mean_token_accuracy": 0.9808032715320587,
+ "num_tokens": 9056953.0,
+ "step": 3550
+ },
+ {
+ "entropy": 0.1268651543557644,
+ "epoch": 9.35064935064935,
+ "grad_norm": 0.3049603998661041,
+ "learning_rate": 2.8591921167149736e-06,
+ "loss": 0.06346964359283447,
+ "mean_token_accuracy": 0.9791843616962432,
+ "num_tokens": 9186990.0,
+ "step": 3600
+ },
+ {
+ "entropy": 0.1311937213689089,
+ "epoch": 9.480519480519481,
+ "grad_norm": 0.2234215885400772,
+ "learning_rate": 1.8363727975003678e-06,
+ "loss": 0.06572246074676513,
+ "mean_token_accuracy": 0.9789830875396729,
+ "num_tokens": 9311897.0,
+ "step": 3650
+ },
+ {
+ "entropy": 0.13369733810424805,
+ "epoch": 9.61038961038961,
+ "grad_norm": 0.2675652503967285,
+ "learning_rate": 1.0376394354638012e-06,
+ "loss": 0.0670329761505127,
+ "mean_token_accuracy": 0.9782129484415054,
+ "num_tokens": 9434396.0,
+ "step": 3700
+ },
+ {
+ "entropy": 0.13093947909772397,
+ "epoch": 9.74025974025974,
+ "grad_norm": 0.24558641016483307,
+ "learning_rate": 4.646332276376641e-07,
+ "loss": 0.06605084896087647,
+ "mean_token_accuracy": 0.9788406610488891,
+ "num_tokens": 9560766.0,
+ "step": 3750
+ },
+ {
+ "entropy": 0.13027613274753094,
+ "epoch": 9.87012987012987,
+ "grad_norm": 0.26439452171325684,
+ "learning_rate": 1.1853155828124577e-07,
+ "loss": 0.06636544704437256,
+ "mean_token_accuracy": 0.9784610909223557,
+ "num_tokens": 9686573.0,
+ "step": 3800
+ },
+ {
+ "entropy": 0.12754634492099284,
+ "epoch": 10.0,
+ "grad_norm": 0.3149493932723999,
+ "learning_rate": 4.5579650927833065e-11,
+ "loss": 0.06343324661254883,
+ "mean_token_accuracy": 0.9797626113891602,
+ "num_tokens": 9815500.0,
+ "step": 3850
+ },
+ {
+ "epoch": 10.0,
+ "eval_entropy": 0.27749587402034265,
+ "eval_loss": 1.2116529941558838,
+ "eval_mean_token_accuracy": 0.8092864940945919,
+ "eval_num_tokens": 9815500.0,
+ "eval_runtime": 46.5276,
+ "eval_samples_per_second": 35.613,
+ "eval_steps_per_second": 4.47,
+ "step": 3850
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.436930629170852e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/README.md b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/adapter_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d42893a69e354223672a3b0f9de6f2b979c004
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 64,
+ "lora_bias": false,
+ "lora_dropout": 0.05284766149829996,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/chat_template.jinja b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/tokenizer_config.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/trainer_state.json b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..96b3397fd4527850dd5d13eff943a1847c023137
--- /dev/null
+++ b/DBCA_original_Estonian/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-770/trainer_state.json
@@ -0,0 +1,206 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 770,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8529596650600433,
+ "epoch": 0.12987012987012986,
+ "grad_norm": 0.878183901309967,
+ "learning_rate": 2.8227544379109735e-05,
+ "loss": 1.762685546875,
+ "mean_token_accuracy": 0.6396001759171486,
+ "num_tokens": 133878.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.9309259909391403,
+ "epoch": 0.2597402597402597,
+ "grad_norm": 0.6279756426811218,
+ "learning_rate": 5.7031161092487016e-05,
+ "loss": 0.8922532653808594,
+ "mean_token_accuracy": 0.768746777176857,
+ "num_tokens": 265533.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.8268384379148483,
+ "epoch": 0.38961038961038963,
+ "grad_norm": 0.5478948354721069,
+ "learning_rate": 8.583477780586432e-05,
+ "loss": 0.7756452178955078,
+ "mean_token_accuracy": 0.7925205600261688,
+ "num_tokens": 386480.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.7658095824718475,
+ "epoch": 0.5194805194805194,
+ "grad_norm": 0.5972853899002075,
+ "learning_rate": 0.00011463839451924158,
+ "loss": 0.7161135101318359,
+ "mean_token_accuracy": 0.8019153982400894,
+ "num_tokens": 520135.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.769344270825386,
+ "epoch": 0.6493506493506493,
+ "grad_norm": 0.5576531887054443,
+ "learning_rate": 0.0001434420112326189,
+ "loss": 0.716611557006836,
+ "mean_token_accuracy": 0.8028716742992401,
+ "num_tokens": 640303.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.7470700722932816,
+ "epoch": 0.7792207792207793,
+ "grad_norm": 0.3937978148460388,
+ "learning_rate": 0.0001722456279459962,
+ "loss": 0.6983388519287109,
+ "mean_token_accuracy": 0.80596988260746,
+ "num_tokens": 766027.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.7172233510017395,
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.32564300298690796,
+ "learning_rate": 0.00020104924465937345,
+ "loss": 0.6731016540527344,
+ "mean_token_accuracy": 0.8119878542423248,
+ "num_tokens": 894871.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.7397788639825124,
+ "eval_loss": 0.7304292917251587,
+ "eval_mean_token_accuracy": 0.7976073359067624,
+ "eval_num_tokens": 981550.0,
+ "eval_runtime": 46.5688,
+ "eval_samples_per_second": 35.582,
+ "eval_steps_per_second": 4.467,
+ "step": 385
+ },
+ {
+ "entropy": 0.7035581320524216,
+ "epoch": 1.0389610389610389,
+ "grad_norm": 0.45747122168540955,
+ "learning_rate": 0.00022177891520076015,
+ "loss": 0.6450813293457032,
+ "mean_token_accuracy": 0.8178210550546646,
+ "num_tokens": 1021041.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.6679977881908417,
+ "epoch": 1.1688311688311688,
+ "grad_norm": 0.4456019401550293,
+ "learning_rate": 0.0002216012068086725,
+ "loss": 0.6189227294921875,
+ "mean_token_accuracy": 0.8257622331380844,
+ "num_tokens": 1149161.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.6574469250440598,
+ "epoch": 1.2987012987012987,
+ "grad_norm": 0.3784359097480774,
+ "learning_rate": 0.00022119602267552194,
+ "loss": 0.6098381805419922,
+ "mean_token_accuracy": 0.8251238936185836,
+ "num_tokens": 1278079.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.6593573099374771,
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.36508747935295105,
+ "learning_rate": 0.00022056419535323196,
+ "loss": 0.6137563705444335,
+ "mean_token_accuracy": 0.824503253698349,
+ "num_tokens": 1405326.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.6459340593218803,
+ "epoch": 1.5584415584415585,
+ "grad_norm": 0.39773380756378174,
+ "learning_rate": 0.00021970702308872148,
+ "loss": 0.5986767196655274,
+ "mean_token_accuracy": 0.8270765203237533,
+ "num_tokens": 1532507.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.633097175359726,
+ "epoch": 1.6883116883116882,
+ "grad_norm": 0.41731834411621094,
+ "learning_rate": 0.00021862626715633265,
+ "loss": 0.5849922180175782,
+ "mean_token_accuracy": 0.8313153618574143,
+ "num_tokens": 1662747.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.6569280475378036,
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.3067665100097656,
+ "learning_rate": 0.00021732414823885307,
+ "loss": 0.605062370300293,
+ "mean_token_accuracy": 0.8269566106796264,
+ "num_tokens": 1783817.0,
+ "step": 700
+ },
+ {
+ "entropy": 0.6459099870920181,
+ "epoch": 1.948051948051948,
+ "grad_norm": 0.5508913397789001,
+ "learning_rate": 0.00021580334186456886,
+ "loss": 0.596672134399414,
+ "mean_token_accuracy": 0.8287447422742844,
+ "num_tokens": 1914573.0,
+ "step": 750
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.6985598478752834,
+ "eval_loss": 0.6716415286064148,
+ "eval_mean_token_accuracy": 0.808078097895934,
+ "eval_num_tokens": 1963100.0,
+ "eval_runtime": 46.5197,
+ "eval_samples_per_second": 35.619,
+ "eval_steps_per_second": 4.471,
+ "step": 770
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3850,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.0934953976464589e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..20bdc3e560a9ea3898b4420cd708cf43b0d087bb
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/README.md
@@ -0,0 +1,58 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: transformers
+model_name: Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1
+tags:
+- generated_from_trainer
+- trl
+- sft
+licence: license
+---
+
+# Model Card for Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1
+
+This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="None", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/zi0h7g34)
+
+
+
+This model was trained with SFT.
+
+### Framework versions
+
+- TRL: 0.29.0
+- Transformers: 5.5.4
+- Pytorch: 2.10.0
+- Datasets: 4.6.1
+- Tokenizers: 0.22.2
+
+## Citations
+
+
+
+Cite TRL as:
+
+```bibtex
+@software{vonwerra2020trl,
+ title = {{TRL: Transformers Reinforcement Learning}},
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+ license = {Apache-2.0},
+ url = {https://github.com/huggingface/trl},
+ year = {2020}
+}
+```
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.0652985372477836,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- System header: when tools are supplied, emit the optional system message followed by the JSON tool signatures; otherwise emit only a leading system message. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Scan messages newest-first and record the index of the last user message that is not a wrapped tool response; defaults to the final message index. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{#- Render every message in ChatML form: user/system turns verbatim, assistant turns with optional think block and tool calls, and runs of consecutive tool messages grouped into a single user turn. -#}{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{#- Open the assistant turn for generation; when enable_thinking is explicitly false, pre-fill an empty think block. -#}{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a8f9e7bf36688d4fb23482908e519b39dfbb8d22
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json
@@ -0,0 +1,287 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 1122,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8254910862445832,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 0.8958511352539062,
+ "learning_rate": 2.7446846603309888e-05,
+ "loss": 1.722928009033203,
+ "mean_token_accuracy": 0.6557231456041336,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8272530055046081,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6162250638008118,
+ "learning_rate": 5.5453832933217935e-05,
+ "loss": 0.7761137390136719,
+ "mean_token_accuracy": 0.795179500579834,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6629777508974075,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.43394413590431213,
+ "learning_rate": 8.346081926312598e-05,
+ "loss": 0.6302793884277343,
+ "mean_token_accuracy": 0.8262274640798569,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6184763962030411,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.4373406767845154,
+ "learning_rate": 0.00011146780559303404,
+ "loss": 0.5863075256347656,
+ "mean_token_accuracy": 0.8357332807779312,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6116772794723511,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.6781982183456421,
+ "learning_rate": 0.00013947479192294207,
+ "loss": 0.5783074951171875,
+ "mean_token_accuracy": 0.8368694090843201,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5848374783992767,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.3493351340293884,
+ "learning_rate": 0.00016748177825285014,
+ "loss": 0.5531037902832031,
+ "mean_token_accuracy": 0.8434528934955597,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.57043960750103,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.3953551650047302,
+ "learning_rate": 0.00019548876458275817,
+ "loss": 0.5387083053588867,
+ "mean_token_accuracy": 0.844996885061264,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6065685012936592,
+ "eval_loss": 0.5825985074043274,
+ "eval_mean_token_accuracy": 0.8326057174801826,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.017,
+ "eval_samples_per_second": 32.621,
+ "eval_steps_per_second": 4.08,
+ "step": 374
+ },
+ {
+ "entropy": 0.5623676343397661,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.37431156635284424,
+ "learning_rate": 0.00020946374495076317,
+ "loss": 0.526231803894043,
+ "mean_token_accuracy": 0.8500469212580208,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5420066699385643,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3302382826805115,
+ "learning_rate": 0.00020923573570386192,
+ "loss": 0.5088261032104492,
+ "mean_token_accuracy": 0.8518517130613327,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5347033357620239,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.281392902135849,
+ "learning_rate": 0.00020878021367110025,
+ "loss": 0.5044374084472656,
+ "mean_token_accuracy": 0.8548643559217453,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5319472518563271,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.3298404812812805,
+ "learning_rate": 0.00020809817069357935,
+ "loss": 0.5011252593994141,
+ "mean_token_accuracy": 0.8546603417396545,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5234624195098877,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.26496022939682007,
+ "learning_rate": 0.00020719109183285305,
+ "loss": 0.49288436889648435,
+ "mean_token_accuracy": 0.8559961414337158,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5170091751217842,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2679271996021271,
+ "learning_rate": 0.00020606095213739626,
+ "loss": 0.4867116165161133,
+ "mean_token_accuracy": 0.8584025889635086,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.519580851495266,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2783832848072052,
+ "learning_rate": 0.0002047102123421885,
+ "loss": 0.4858899688720703,
+ "mean_token_accuracy": 0.8601382756233216,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5516054129600525,
+ "eval_loss": 0.5468233227729797,
+ "eval_mean_token_accuracy": 0.8418151989579201,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.0401,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 748
+ },
+ {
+ "entropy": 0.5179645954960524,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.39848020672798157,
+ "learning_rate": 0.00020314181351077757,
+ "loss": 0.4828613662719727,
+ "mean_token_accuracy": 0.8594950991447525,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4724735128879547,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.27450788021087646,
+ "learning_rate": 0.00020135917063148916,
+ "loss": 0.43275066375732424,
+ "mean_token_accuracy": 0.8704854369163513,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.4807534040510654,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.3146534562110901,
+ "learning_rate": 0.00019936616518172531,
+ "loss": 0.44326435089111327,
+ "mean_token_accuracy": 0.8670724099874496,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4744683504104614,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.3151724338531494,
+ "learning_rate": 0.0001971671366765428,
+ "loss": 0.44036914825439455,
+ "mean_token_accuracy": 0.8687405550479889,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4728820985555649,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.30318814516067505,
+ "learning_rate": 0.00019476687321991266,
+ "loss": 0.43707496643066407,
+ "mean_token_accuracy": 0.8694235664606095,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4706392896175384,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.32992425560951233,
+ "learning_rate": 0.00019217060107923494,
+ "loss": 0.4379159545898437,
+ "mean_token_accuracy": 0.8695945852994919,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.46944593608379365,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.23659372329711914,
+ "learning_rate": 0.0001893839733058082,
+ "loss": 0.43395462036132815,
+ "mean_token_accuracy": 0.8707112389802932,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.47586817651987073,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.25799325108528137,
+ "learning_rate": 0.00018641305742603172,
+ "loss": 0.44020862579345704,
+ "mean_token_accuracy": 0.869452143907547,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.5092438699305057,
+ "eval_loss": 0.5377861857414246,
+ "eval_mean_token_accuracy": 0.8451382353901863,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.0653,
+ "eval_samples_per_second": 32.589,
+ "eval_steps_per_second": 4.076,
+ "step": 1122
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.451742995129948e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.0652985372477836,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- System header: when tools are supplied, emit the optional system message followed by the JSON tool signatures; otherwise emit only a leading system message. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Scan messages newest-first and record the index of the last user message that is not a wrapped tool response; defaults to the final message index. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{#- Render every message in ChatML form: user/system turns verbatim, assistant turns with optional think block and tool calls, and runs of consecutive tool messages grouped into a single user turn. -#}{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{#- Open the assistant turn for generation; when enable_thinking is explicitly false, pre-fill an empty think block. -#}{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b39d18fb7d2a26a456968dd6b90f8508457b8d69
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json
@@ -0,0 +1,368 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 4.0,
+ "eval_steps": 500,
+ "global_step": 1496,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8254910862445832,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 0.8958511352539062,
+ "learning_rate": 2.7446846603309888e-05,
+ "loss": 1.722928009033203,
+ "mean_token_accuracy": 0.6557231456041336,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8272530055046081,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6162250638008118,
+ "learning_rate": 5.5453832933217935e-05,
+ "loss": 0.7761137390136719,
+ "mean_token_accuracy": 0.795179500579834,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6629777508974075,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.43394413590431213,
+ "learning_rate": 8.346081926312598e-05,
+ "loss": 0.6302793884277343,
+ "mean_token_accuracy": 0.8262274640798569,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6184763962030411,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.4373406767845154,
+ "learning_rate": 0.00011146780559303404,
+ "loss": 0.5863075256347656,
+ "mean_token_accuracy": 0.8357332807779312,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6116772794723511,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.6781982183456421,
+ "learning_rate": 0.00013947479192294207,
+ "loss": 0.5783074951171875,
+ "mean_token_accuracy": 0.8368694090843201,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5848374783992767,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.3493351340293884,
+ "learning_rate": 0.00016748177825285014,
+ "loss": 0.5531037902832031,
+ "mean_token_accuracy": 0.8434528934955597,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.57043960750103,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.3953551650047302,
+ "learning_rate": 0.00019548876458275817,
+ "loss": 0.5387083053588867,
+ "mean_token_accuracy": 0.844996885061264,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6065685012936592,
+ "eval_loss": 0.5825985074043274,
+ "eval_mean_token_accuracy": 0.8326057174801826,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.017,
+ "eval_samples_per_second": 32.621,
+ "eval_steps_per_second": 4.08,
+ "step": 374
+ },
+ {
+ "entropy": 0.5623676343397661,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.37431156635284424,
+ "learning_rate": 0.00020946374495076317,
+ "loss": 0.526231803894043,
+ "mean_token_accuracy": 0.8500469212580208,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5420066699385643,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3302382826805115,
+ "learning_rate": 0.00020923573570386192,
+ "loss": 0.5088261032104492,
+ "mean_token_accuracy": 0.8518517130613327,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5347033357620239,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.281392902135849,
+ "learning_rate": 0.00020878021367110025,
+ "loss": 0.5044374084472656,
+ "mean_token_accuracy": 0.8548643559217453,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5319472518563271,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.3298404812812805,
+ "learning_rate": 0.00020809817069357935,
+ "loss": 0.5011252593994141,
+ "mean_token_accuracy": 0.8546603417396545,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5234624195098877,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.26496022939682007,
+ "learning_rate": 0.00020719109183285305,
+ "loss": 0.49288436889648435,
+ "mean_token_accuracy": 0.8559961414337158,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5170091751217842,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2679271996021271,
+ "learning_rate": 0.00020606095213739626,
+ "loss": 0.4867116165161133,
+ "mean_token_accuracy": 0.8584025889635086,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.519580851495266,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2783832848072052,
+ "learning_rate": 0.0002047102123421885,
+ "loss": 0.4858899688720703,
+ "mean_token_accuracy": 0.8601382756233216,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5516054129600525,
+ "eval_loss": 0.5468233227729797,
+ "eval_mean_token_accuracy": 0.8418151989579201,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.0401,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 748
+ },
+ {
+ "entropy": 0.5179645954960524,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.39848020672798157,
+ "learning_rate": 0.00020314181351077757,
+ "loss": 0.4828613662719727,
+ "mean_token_accuracy": 0.8594950991447525,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4724735128879547,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.27450788021087646,
+ "learning_rate": 0.00020135917063148916,
+ "loss": 0.43275066375732424,
+ "mean_token_accuracy": 0.8704854369163513,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.4807534040510654,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.3146534562110901,
+ "learning_rate": 0.00019936616518172531,
+ "loss": 0.44326435089111327,
+ "mean_token_accuracy": 0.8670724099874496,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4744683504104614,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.3151724338531494,
+ "learning_rate": 0.0001971671366765428,
+ "loss": 0.44036914825439455,
+ "mean_token_accuracy": 0.8687405550479889,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4728820985555649,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.30318814516067505,
+ "learning_rate": 0.00019476687321991266,
+ "loss": 0.43707496643066407,
+ "mean_token_accuracy": 0.8694235664606095,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4706392896175384,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.32992425560951233,
+ "learning_rate": 0.00019217060107923494,
+ "loss": 0.4379159545898437,
+ "mean_token_accuracy": 0.8695945852994919,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.46944593608379365,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.23659372329711914,
+ "learning_rate": 0.0001893839733058082,
+ "loss": 0.43395462036132815,
+ "mean_token_accuracy": 0.8707112389802932,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.47586817651987073,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.25799325108528137,
+ "learning_rate": 0.00018641305742603172,
+ "loss": 0.44020862579345704,
+ "mean_token_accuracy": 0.869452143907547,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.5092438699305057,
+ "eval_loss": 0.5377861857414246,
+ "eval_mean_token_accuracy": 0.8451382353901863,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.0653,
+ "eval_samples_per_second": 32.589,
+ "eval_steps_per_second": 4.076,
+ "step": 1122
+ },
+ {
+ "entropy": 0.44004572521556506,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.31287693977355957,
+ "learning_rate": 0.0001832643222301409,
+ "loss": 0.39778636932373046,
+ "mean_token_accuracy": 0.8794932232962714,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.42112498462200165,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.3411082327365875,
+ "learning_rate": 0.000179944623687242,
+ "loss": 0.3773982620239258,
+ "mean_token_accuracy": 0.8827895969152451,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.41285495966672897,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.31819090247154236,
+ "learning_rate": 0.0001764611900173143,
+ "loss": 0.3741728210449219,
+ "mean_token_accuracy": 0.8846501314640045,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.42110098838806154,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.3009640872478485,
+ "learning_rate": 0.00017282160595268327,
+ "loss": 0.3816569900512695,
+ "mean_token_accuracy": 0.8814844501018524,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4269171151518822,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.3651696741580963,
+ "learning_rate": 0.00016903379622323396,
+ "loss": 0.38763641357421874,
+ "mean_token_accuracy": 0.8813142603635788,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4262796178460121,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.28190597891807556,
+ "learning_rate": 0.00016510600830132272,
+ "loss": 0.38563640594482423,
+ "mean_token_accuracy": 0.8813980734348297,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4200029063224793,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.2584955096244812,
+ "learning_rate": 0.00016104679444395854,
+ "loss": 0.3829658508300781,
+ "mean_token_accuracy": 0.8822885012626648,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.47917599841952324,
+ "eval_loss": 0.5421923398971558,
+ "eval_mean_token_accuracy": 0.8468858799338341,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.0466,
+ "eval_samples_per_second": 32.602,
+ "eval_steps_per_second": 4.078,
+ "step": 1496
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.9374623481280205e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.0652985372477836,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Qwen3 ChatML chat template: optional tools/system preamble, then every message in order, then the generation prompt. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a209f85416c7076dcb08e85e0ec91fe1093a3ded
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json
@@ -0,0 +1,459 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 5.0,
+ "eval_steps": 500,
+ "global_step": 1870,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8254910862445832,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 0.8958511352539062,
+ "learning_rate": 2.7446846603309888e-05,
+ "loss": 1.722928009033203,
+ "mean_token_accuracy": 0.6557231456041336,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8272530055046081,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6162250638008118,
+ "learning_rate": 5.5453832933217935e-05,
+ "loss": 0.7761137390136719,
+ "mean_token_accuracy": 0.795179500579834,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6629777508974075,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.43394413590431213,
+ "learning_rate": 8.346081926312598e-05,
+ "loss": 0.6302793884277343,
+ "mean_token_accuracy": 0.8262274640798569,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6184763962030411,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.4373406767845154,
+ "learning_rate": 0.00011146780559303404,
+ "loss": 0.5863075256347656,
+ "mean_token_accuracy": 0.8357332807779312,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6116772794723511,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.6781982183456421,
+ "learning_rate": 0.00013947479192294207,
+ "loss": 0.5783074951171875,
+ "mean_token_accuracy": 0.8368694090843201,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5848374783992767,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.3493351340293884,
+ "learning_rate": 0.00016748177825285014,
+ "loss": 0.5531037902832031,
+ "mean_token_accuracy": 0.8434528934955597,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.57043960750103,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.3953551650047302,
+ "learning_rate": 0.00019548876458275817,
+ "loss": 0.5387083053588867,
+ "mean_token_accuracy": 0.844996885061264,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6065685012936592,
+ "eval_loss": 0.5825985074043274,
+ "eval_mean_token_accuracy": 0.8326057174801826,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.017,
+ "eval_samples_per_second": 32.621,
+ "eval_steps_per_second": 4.08,
+ "step": 374
+ },
+ {
+ "entropy": 0.5623676343397661,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.37431156635284424,
+ "learning_rate": 0.00020946374495076317,
+ "loss": 0.526231803894043,
+ "mean_token_accuracy": 0.8500469212580208,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5420066699385643,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3302382826805115,
+ "learning_rate": 0.00020923573570386192,
+ "loss": 0.5088261032104492,
+ "mean_token_accuracy": 0.8518517130613327,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5347033357620239,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.281392902135849,
+ "learning_rate": 0.00020878021367110025,
+ "loss": 0.5044374084472656,
+ "mean_token_accuracy": 0.8548643559217453,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5319472518563271,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.3298404812812805,
+ "learning_rate": 0.00020809817069357935,
+ "loss": 0.5011252593994141,
+ "mean_token_accuracy": 0.8546603417396545,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5234624195098877,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.26496022939682007,
+ "learning_rate": 0.00020719109183285305,
+ "loss": 0.49288436889648435,
+ "mean_token_accuracy": 0.8559961414337158,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5170091751217842,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2679271996021271,
+ "learning_rate": 0.00020606095213739626,
+ "loss": 0.4867116165161133,
+ "mean_token_accuracy": 0.8584025889635086,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.519580851495266,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2783832848072052,
+ "learning_rate": 0.0002047102123421885,
+ "loss": 0.4858899688720703,
+ "mean_token_accuracy": 0.8601382756233216,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5516054129600525,
+ "eval_loss": 0.5468233227729797,
+ "eval_mean_token_accuracy": 0.8418151989579201,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.0401,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 748
+ },
+ {
+ "entropy": 0.5179645954960524,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.39848020672798157,
+ "learning_rate": 0.00020314181351077757,
+ "loss": 0.4828613662719727,
+ "mean_token_accuracy": 0.8594950991447525,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4724735128879547,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.27450788021087646,
+ "learning_rate": 0.00020135917063148916,
+ "loss": 0.43275066375732424,
+ "mean_token_accuracy": 0.8704854369163513,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.4807534040510654,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.3146534562110901,
+ "learning_rate": 0.00019936616518172531,
+ "loss": 0.44326435089111327,
+ "mean_token_accuracy": 0.8670724099874496,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4744683504104614,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.3151724338531494,
+ "learning_rate": 0.0001971671366765428,
+ "loss": 0.44036914825439455,
+ "mean_token_accuracy": 0.8687405550479889,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4728820985555649,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.30318814516067505,
+ "learning_rate": 0.00019476687321991266,
+ "loss": 0.43707496643066407,
+ "mean_token_accuracy": 0.8694235664606095,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4706392896175384,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.32992425560951233,
+ "learning_rate": 0.00019217060107923494,
+ "loss": 0.4379159545898437,
+ "mean_token_accuracy": 0.8695945852994919,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.46944593608379365,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.23659372329711914,
+ "learning_rate": 0.0001893839733058082,
+ "loss": 0.43395462036132815,
+ "mean_token_accuracy": 0.8707112389802932,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.47586817651987073,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.25799325108528137,
+ "learning_rate": 0.00018641305742603172,
+ "loss": 0.44020862579345704,
+ "mean_token_accuracy": 0.869452143907547,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.5092438699305057,
+ "eval_loss": 0.5377861857414246,
+ "eval_mean_token_accuracy": 0.8451382353901863,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.0653,
+ "eval_samples_per_second": 32.589,
+ "eval_steps_per_second": 4.076,
+ "step": 1122
+ },
+ {
+ "entropy": 0.44004572521556506,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.31287693977355957,
+ "learning_rate": 0.0001832643222301409,
+ "loss": 0.39778636932373046,
+ "mean_token_accuracy": 0.8794932232962714,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.42112498462200165,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.3411082327365875,
+ "learning_rate": 0.000179944623687242,
+ "loss": 0.3773982620239258,
+ "mean_token_accuracy": 0.8827895969152451,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.41285495966672897,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.31819090247154236,
+ "learning_rate": 0.0001764611900173143,
+ "loss": 0.3741728210449219,
+ "mean_token_accuracy": 0.8846501314640045,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.42110098838806154,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.3009640872478485,
+ "learning_rate": 0.00017282160595268327,
+ "loss": 0.3816569900512695,
+ "mean_token_accuracy": 0.8814844501018524,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4269171151518822,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.3651696741580963,
+ "learning_rate": 0.00016903379622323396,
+ "loss": 0.38763641357421874,
+ "mean_token_accuracy": 0.8813142603635788,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4262796178460121,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.28190597891807556,
+ "learning_rate": 0.00016510600830132272,
+ "loss": 0.38563640594482423,
+ "mean_token_accuracy": 0.8813980734348297,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4200029063224793,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.2584955096244812,
+ "learning_rate": 0.00016104679444395854,
+ "loss": 0.3829658508300781,
+ "mean_token_accuracy": 0.8822885012626648,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.47917599841952324,
+ "eval_loss": 0.5421923398971558,
+ "eval_mean_token_accuracy": 0.8468858799338341,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.0466,
+ "eval_samples_per_second": 32.602,
+ "eval_steps_per_second": 4.078,
+ "step": 1496
+ },
+ {
+ "entropy": 0.42061784803265273,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.3225401043891907,
+ "learning_rate": 0.0001568649930713548,
+ "loss": 0.38076282501220704,
+ "mean_token_accuracy": 0.8825460594109814,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3564541311562061,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.42701128125190735,
+ "learning_rate": 0.00015256970952239702,
+ "loss": 0.3080678176879883,
+ "mean_token_accuracy": 0.9021131205558777,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3623583456873894,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.7542179822921753,
+ "learning_rate": 0.00014817029622892904,
+ "loss": 0.31919103622436523,
+ "mean_token_accuracy": 0.8978805804252624,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.35793897867202756,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.3683936595916748,
+ "learning_rate": 0.0001436763323520266,
+ "loss": 0.31606245040893555,
+ "mean_token_accuracy": 0.8989632934331894,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.36605307310819624,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.3938419222831726,
+ "learning_rate": 0.00013909760292459586,
+ "loss": 0.3214926528930664,
+ "mean_token_accuracy": 0.897950147986412,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.3702411252260208,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.3159140646457672,
+ "learning_rate": 0.0001344440775457131,
+ "loss": 0.32606857299804687,
+ "mean_token_accuracy": 0.8971680045127869,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.3695961621403694,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.43663182854652405,
+ "learning_rate": 0.00012972588867309488,
+ "loss": 0.324642448425293,
+ "mean_token_accuracy": 0.8974496972560883,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3658722630143166,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3189752995967865,
+ "learning_rate": 0.0001249533095609642,
+ "loss": 0.3198036575317383,
+ "mean_token_accuracy": 0.8985732847452164,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.43428498685359956,
+ "eval_loss": 0.5698739290237427,
+ "eval_mean_token_accuracy": 0.8450208070874214,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.0399,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 1870
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.415768868260864e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.0652985372477836,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Qwen3 ChatML chat template: optional tools/system preamble, then every message in order, then the generation prompt. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bb85de81c8a519dff0f58100785ea288245bce9
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json
@@ -0,0 +1,540 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 6.0,
+ "eval_steps": 500,
+ "global_step": 2244,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8254910862445832,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 0.8958511352539062,
+ "learning_rate": 2.7446846603309888e-05,
+ "loss": 1.722928009033203,
+ "mean_token_accuracy": 0.6557231456041336,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8272530055046081,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6162250638008118,
+ "learning_rate": 5.5453832933217935e-05,
+ "loss": 0.7761137390136719,
+ "mean_token_accuracy": 0.795179500579834,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6629777508974075,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.43394413590431213,
+ "learning_rate": 8.346081926312598e-05,
+ "loss": 0.6302793884277343,
+ "mean_token_accuracy": 0.8262274640798569,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6184763962030411,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.4373406767845154,
+ "learning_rate": 0.00011146780559303404,
+ "loss": 0.5863075256347656,
+ "mean_token_accuracy": 0.8357332807779312,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6116772794723511,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.6781982183456421,
+ "learning_rate": 0.00013947479192294207,
+ "loss": 0.5783074951171875,
+ "mean_token_accuracy": 0.8368694090843201,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5848374783992767,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.3493351340293884,
+ "learning_rate": 0.00016748177825285014,
+ "loss": 0.5531037902832031,
+ "mean_token_accuracy": 0.8434528934955597,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.57043960750103,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.3953551650047302,
+ "learning_rate": 0.00019548876458275817,
+ "loss": 0.5387083053588867,
+ "mean_token_accuracy": 0.844996885061264,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6065685012936592,
+ "eval_loss": 0.5825985074043274,
+ "eval_mean_token_accuracy": 0.8326057174801826,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.017,
+ "eval_samples_per_second": 32.621,
+ "eval_steps_per_second": 4.08,
+ "step": 374
+ },
+ {
+ "entropy": 0.5623676343397661,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.37431156635284424,
+ "learning_rate": 0.00020946374495076317,
+ "loss": 0.526231803894043,
+ "mean_token_accuracy": 0.8500469212580208,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5420066699385643,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3302382826805115,
+ "learning_rate": 0.00020923573570386192,
+ "loss": 0.5088261032104492,
+ "mean_token_accuracy": 0.8518517130613327,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5347033357620239,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.281392902135849,
+ "learning_rate": 0.00020878021367110025,
+ "loss": 0.5044374084472656,
+ "mean_token_accuracy": 0.8548643559217453,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5319472518563271,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.3298404812812805,
+ "learning_rate": 0.00020809817069357935,
+ "loss": 0.5011252593994141,
+ "mean_token_accuracy": 0.8546603417396545,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5234624195098877,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.26496022939682007,
+ "learning_rate": 0.00020719109183285305,
+ "loss": 0.49288436889648435,
+ "mean_token_accuracy": 0.8559961414337158,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5170091751217842,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2679271996021271,
+ "learning_rate": 0.00020606095213739626,
+ "loss": 0.4867116165161133,
+ "mean_token_accuracy": 0.8584025889635086,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.519580851495266,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2783832848072052,
+ "learning_rate": 0.0002047102123421885,
+ "loss": 0.4858899688720703,
+ "mean_token_accuracy": 0.8601382756233216,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5516054129600525,
+ "eval_loss": 0.5468233227729797,
+ "eval_mean_token_accuracy": 0.8418151989579201,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.0401,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 748
+ },
+ {
+ "entropy": 0.5179645954960524,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.39848020672798157,
+ "learning_rate": 0.00020314181351077757,
+ "loss": 0.4828613662719727,
+ "mean_token_accuracy": 0.8594950991447525,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4724735128879547,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.27450788021087646,
+ "learning_rate": 0.00020135917063148916,
+ "loss": 0.43275066375732424,
+ "mean_token_accuracy": 0.8704854369163513,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.4807534040510654,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.3146534562110901,
+ "learning_rate": 0.00019936616518172531,
+ "loss": 0.44326435089111327,
+ "mean_token_accuracy": 0.8670724099874496,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4744683504104614,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.3151724338531494,
+ "learning_rate": 0.0001971671366765428,
+ "loss": 0.44036914825439455,
+ "mean_token_accuracy": 0.8687405550479889,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4728820985555649,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.30318814516067505,
+ "learning_rate": 0.00019476687321991266,
+ "loss": 0.43707496643066407,
+ "mean_token_accuracy": 0.8694235664606095,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4706392896175384,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.32992425560951233,
+ "learning_rate": 0.00019217060107923494,
+ "loss": 0.4379159545898437,
+ "mean_token_accuracy": 0.8695945852994919,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.46944593608379365,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.23659372329711914,
+ "learning_rate": 0.0001893839733058082,
+ "loss": 0.43395462036132815,
+ "mean_token_accuracy": 0.8707112389802932,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.47586817651987073,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.25799325108528137,
+ "learning_rate": 0.00018641305742603172,
+ "loss": 0.44020862579345704,
+ "mean_token_accuracy": 0.869452143907547,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.5092438699305057,
+ "eval_loss": 0.5377861857414246,
+ "eval_mean_token_accuracy": 0.8451382353901863,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.0653,
+ "eval_samples_per_second": 32.589,
+ "eval_steps_per_second": 4.076,
+ "step": 1122
+ },
+ {
+ "entropy": 0.44004572521556506,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.31287693977355957,
+ "learning_rate": 0.0001832643222301409,
+ "loss": 0.39778636932373046,
+ "mean_token_accuracy": 0.8794932232962714,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.42112498462200165,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.3411082327365875,
+ "learning_rate": 0.000179944623687242,
+ "loss": 0.3773982620239258,
+ "mean_token_accuracy": 0.8827895969152451,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.41285495966672897,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.31819090247154236,
+ "learning_rate": 0.0001764611900173143,
+ "loss": 0.3741728210449219,
+ "mean_token_accuracy": 0.8846501314640045,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.42110098838806154,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.3009640872478485,
+ "learning_rate": 0.00017282160595268327,
+ "loss": 0.3816569900512695,
+ "mean_token_accuracy": 0.8814844501018524,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4269171151518822,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.3651696741580963,
+ "learning_rate": 0.00016903379622323396,
+ "loss": 0.38763641357421874,
+ "mean_token_accuracy": 0.8813142603635788,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4262796178460121,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.28190597891807556,
+ "learning_rate": 0.00016510600830132272,
+ "loss": 0.38563640594482423,
+ "mean_token_accuracy": 0.8813980734348297,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4200029063224793,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.2584955096244812,
+ "learning_rate": 0.00016104679444395854,
+ "loss": 0.3829658508300781,
+ "mean_token_accuracy": 0.8822885012626648,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.47917599841952324,
+ "eval_loss": 0.5421923398971558,
+ "eval_mean_token_accuracy": 0.8468858799338341,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.0466,
+ "eval_samples_per_second": 32.602,
+ "eval_steps_per_second": 4.078,
+ "step": 1496
+ },
+ {
+ "entropy": 0.42061784803265273,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.3225401043891907,
+ "learning_rate": 0.0001568649930713548,
+ "loss": 0.38076282501220704,
+ "mean_token_accuracy": 0.8825460594109814,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3564541311562061,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.42701128125190735,
+ "learning_rate": 0.00015256970952239702,
+ "loss": 0.3080678176879883,
+ "mean_token_accuracy": 0.9021131205558777,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3623583456873894,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.7542179822921753,
+ "learning_rate": 0.00014817029622892904,
+ "loss": 0.31919103622436523,
+ "mean_token_accuracy": 0.8978805804252624,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.35793897867202756,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.3683936595916748,
+ "learning_rate": 0.0001436763323520266,
+ "loss": 0.31606245040893555,
+ "mean_token_accuracy": 0.8989632934331894,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.36605307310819624,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.3938419222831726,
+ "learning_rate": 0.00013909760292459586,
+ "loss": 0.3214926528930664,
+ "mean_token_accuracy": 0.897950147986412,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.3702411252260208,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.3159140646457672,
+ "learning_rate": 0.0001344440775457131,
+ "loss": 0.32606857299804687,
+ "mean_token_accuracy": 0.8971680045127869,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.3695961621403694,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.43663182854652405,
+ "learning_rate": 0.00012972588867309488,
+ "loss": 0.324642448425293,
+ "mean_token_accuracy": 0.8974496972560883,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3658722630143166,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3189752995967865,
+ "learning_rate": 0.0001249533095609642,
+ "loss": 0.3198036575317383,
+ "mean_token_accuracy": 0.8985732847452164,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.43428498685359956,
+ "eval_loss": 0.5698739290237427,
+ "eval_mean_token_accuracy": 0.8450208070874214,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.0399,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 1870
+ },
+ {
+ "entropy": 0.3230800422454121,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.38153186440467834,
+ "learning_rate": 0.00012013673189135029,
+ "loss": 0.2727243995666504,
+ "mean_token_accuracy": 0.9120446022110756,
+ "num_tokens": 4679483.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.2979305517673492,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.42468664050102234,
+ "learning_rate": 0.00011528664314752708,
+ "loss": 0.24437490463256836,
+ "mean_token_accuracy": 0.9198145979642868,
+ "num_tokens": 4800131.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.29811844661831854,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.48722052574157715,
+ "learning_rate": 0.0001104136037788565,
+ "loss": 0.2472528076171875,
+ "mean_token_accuracy": 0.9198214167356491,
+ "num_tokens": 4923382.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.3036586672067642,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.4003150165081024,
+ "learning_rate": 0.00010552822420675757,
+ "loss": 0.2524623489379883,
+ "mean_token_accuracy": 0.9182902538776397,
+ "num_tokens": 5042200.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.30276541873812673,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.5082385540008545,
+ "learning_rate": 0.00010064114172186765,
+ "loss": 0.2554252052307129,
+ "mean_token_accuracy": 0.9163929194211959,
+ "num_tokens": 5168285.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.30480867981910703,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.46462953090667725,
+ "learning_rate": 9.57629973226994e-05,
+ "loss": 0.25483154296875,
+ "mean_token_accuracy": 0.916156811118126,
+ "num_tokens": 5289880.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.2954915864765644,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.5345046520233154,
+ "learning_rate": 9.090441254622432e-05,
+ "loss": 0.24575115203857423,
+ "mean_token_accuracy": 0.9198049437999726,
+ "num_tokens": 5413325.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.37039283126592637,
+ "eval_loss": 0.6287115812301636,
+ "eval_mean_token_accuracy": 0.8435265091061592,
+ "eval_num_tokens": 5522622.0,
+ "eval_runtime": 49.0873,
+ "eval_samples_per_second": 32.575,
+ "eval_steps_per_second": 4.074,
+ "step": 2244
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.895304460277965e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.0652985372477836,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc9610bcf09b076b4eb42799410e3743e9276540
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json
@@ -0,0 +1,631 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.0,
+ "eval_steps": 500,
+ "global_step": 2618,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8254910862445832,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 0.8958511352539062,
+ "learning_rate": 2.7446846603309888e-05,
+ "loss": 1.722928009033203,
+ "mean_token_accuracy": 0.6557231456041336,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8272530055046081,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6162250638008118,
+ "learning_rate": 5.5453832933217935e-05,
+ "loss": 0.7761137390136719,
+ "mean_token_accuracy": 0.795179500579834,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6629777508974075,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.43394413590431213,
+ "learning_rate": 8.346081926312598e-05,
+ "loss": 0.6302793884277343,
+ "mean_token_accuracy": 0.8262274640798569,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6184763962030411,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.4373406767845154,
+ "learning_rate": 0.00011146780559303404,
+ "loss": 0.5863075256347656,
+ "mean_token_accuracy": 0.8357332807779312,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6116772794723511,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.6781982183456421,
+ "learning_rate": 0.00013947479192294207,
+ "loss": 0.5783074951171875,
+ "mean_token_accuracy": 0.8368694090843201,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5848374783992767,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.3493351340293884,
+ "learning_rate": 0.00016748177825285014,
+ "loss": 0.5531037902832031,
+ "mean_token_accuracy": 0.8434528934955597,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.57043960750103,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.3953551650047302,
+ "learning_rate": 0.00019548876458275817,
+ "loss": 0.5387083053588867,
+ "mean_token_accuracy": 0.844996885061264,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6065685012936592,
+ "eval_loss": 0.5825985074043274,
+ "eval_mean_token_accuracy": 0.8326057174801826,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.017,
+ "eval_samples_per_second": 32.621,
+ "eval_steps_per_second": 4.08,
+ "step": 374
+ },
+ {
+ "entropy": 0.5623676343397661,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.37431156635284424,
+ "learning_rate": 0.00020946374495076317,
+ "loss": 0.526231803894043,
+ "mean_token_accuracy": 0.8500469212580208,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5420066699385643,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3302382826805115,
+ "learning_rate": 0.00020923573570386192,
+ "loss": 0.5088261032104492,
+ "mean_token_accuracy": 0.8518517130613327,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5347033357620239,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.281392902135849,
+ "learning_rate": 0.00020878021367110025,
+ "loss": 0.5044374084472656,
+ "mean_token_accuracy": 0.8548643559217453,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5319472518563271,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.3298404812812805,
+ "learning_rate": 0.00020809817069357935,
+ "loss": 0.5011252593994141,
+ "mean_token_accuracy": 0.8546603417396545,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5234624195098877,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.26496022939682007,
+ "learning_rate": 0.00020719109183285305,
+ "loss": 0.49288436889648435,
+ "mean_token_accuracy": 0.8559961414337158,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5170091751217842,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2679271996021271,
+ "learning_rate": 0.00020606095213739626,
+ "loss": 0.4867116165161133,
+ "mean_token_accuracy": 0.8584025889635086,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.519580851495266,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2783832848072052,
+ "learning_rate": 0.0002047102123421885,
+ "loss": 0.4858899688720703,
+ "mean_token_accuracy": 0.8601382756233216,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5516054129600525,
+ "eval_loss": 0.5468233227729797,
+ "eval_mean_token_accuracy": 0.8418151989579201,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.0401,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 748
+ },
+ {
+ "entropy": 0.5179645954960524,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.39848020672798157,
+ "learning_rate": 0.00020314181351077757,
+ "loss": 0.4828613662719727,
+ "mean_token_accuracy": 0.8594950991447525,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4724735128879547,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.27450788021087646,
+ "learning_rate": 0.00020135917063148916,
+ "loss": 0.43275066375732424,
+ "mean_token_accuracy": 0.8704854369163513,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.4807534040510654,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.3146534562110901,
+ "learning_rate": 0.00019936616518172531,
+ "loss": 0.44326435089111327,
+ "mean_token_accuracy": 0.8670724099874496,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4744683504104614,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.3151724338531494,
+ "learning_rate": 0.0001971671366765428,
+ "loss": 0.44036914825439455,
+ "mean_token_accuracy": 0.8687405550479889,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4728820985555649,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.30318814516067505,
+ "learning_rate": 0.00019476687321991266,
+ "loss": 0.43707496643066407,
+ "mean_token_accuracy": 0.8694235664606095,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4706392896175384,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.32992425560951233,
+ "learning_rate": 0.00019217060107923494,
+ "loss": 0.4379159545898437,
+ "mean_token_accuracy": 0.8695945852994919,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.46944593608379365,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.23659372329711914,
+ "learning_rate": 0.0001893839733058082,
+ "loss": 0.43395462036132815,
+ "mean_token_accuracy": 0.8707112389802932,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.47586817651987073,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.25799325108528137,
+ "learning_rate": 0.00018641305742603172,
+ "loss": 0.44020862579345704,
+ "mean_token_accuracy": 0.869452143907547,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.5092438699305057,
+ "eval_loss": 0.5377861857414246,
+ "eval_mean_token_accuracy": 0.8451382353901863,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.0653,
+ "eval_samples_per_second": 32.589,
+ "eval_steps_per_second": 4.076,
+ "step": 1122
+ },
+ {
+ "entropy": 0.44004572521556506,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.31287693977355957,
+ "learning_rate": 0.0001832643222301409,
+ "loss": 0.39778636932373046,
+ "mean_token_accuracy": 0.8794932232962714,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.42112498462200165,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.3411082327365875,
+ "learning_rate": 0.000179944623687242,
+ "loss": 0.3773982620239258,
+ "mean_token_accuracy": 0.8827895969152451,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.41285495966672897,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.31819090247154236,
+ "learning_rate": 0.0001764611900173143,
+ "loss": 0.3741728210449219,
+ "mean_token_accuracy": 0.8846501314640045,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.42110098838806154,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.3009640872478485,
+ "learning_rate": 0.00017282160595268327,
+ "loss": 0.3816569900512695,
+ "mean_token_accuracy": 0.8814844501018524,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4269171151518822,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.3651696741580963,
+ "learning_rate": 0.00016903379622323396,
+ "loss": 0.38763641357421874,
+ "mean_token_accuracy": 0.8813142603635788,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4262796178460121,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.28190597891807556,
+ "learning_rate": 0.00016510600830132272,
+ "loss": 0.38563640594482423,
+ "mean_token_accuracy": 0.8813980734348297,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4200029063224793,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.2584955096244812,
+ "learning_rate": 0.00016104679444395854,
+ "loss": 0.3829658508300781,
+ "mean_token_accuracy": 0.8822885012626648,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.47917599841952324,
+ "eval_loss": 0.5421923398971558,
+ "eval_mean_token_accuracy": 0.8468858799338341,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.0466,
+ "eval_samples_per_second": 32.602,
+ "eval_steps_per_second": 4.078,
+ "step": 1496
+ },
+ {
+ "entropy": 0.42061784803265273,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.3225401043891907,
+ "learning_rate": 0.0001568649930713548,
+ "loss": 0.38076282501220704,
+ "mean_token_accuracy": 0.8825460594109814,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3564541311562061,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.42701128125190735,
+ "learning_rate": 0.00015256970952239702,
+ "loss": 0.3080678176879883,
+ "mean_token_accuracy": 0.9021131205558777,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3623583456873894,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.7542179822921753,
+ "learning_rate": 0.00014817029622892904,
+ "loss": 0.31919103622436523,
+ "mean_token_accuracy": 0.8978805804252624,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.35793897867202756,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.3683936595916748,
+ "learning_rate": 0.0001436763323520266,
+ "loss": 0.31606245040893555,
+ "mean_token_accuracy": 0.8989632934331894,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.36605307310819624,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.3938419222831726,
+ "learning_rate": 0.00013909760292459586,
+ "loss": 0.3214926528930664,
+ "mean_token_accuracy": 0.897950147986412,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.3702411252260208,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.3159140646457672,
+ "learning_rate": 0.0001344440775457131,
+ "loss": 0.32606857299804687,
+ "mean_token_accuracy": 0.8971680045127869,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.3695961621403694,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.43663182854652405,
+ "learning_rate": 0.00012972588867309488,
+ "loss": 0.324642448425293,
+ "mean_token_accuracy": 0.8974496972560883,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3658722630143166,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3189752995967865,
+ "learning_rate": 0.0001249533095609642,
+ "loss": 0.3198036575317383,
+ "mean_token_accuracy": 0.8985732847452164,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.43428498685359956,
+ "eval_loss": 0.5698739290237427,
+ "eval_mean_token_accuracy": 0.8450208070874214,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.0399,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 1870
+ },
+ {
+ "entropy": 0.3230800422454121,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.38153186440467834,
+ "learning_rate": 0.00012013673189135029,
+ "loss": 0.2727243995666504,
+ "mean_token_accuracy": 0.9120446022110756,
+ "num_tokens": 4679483.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.2979305517673492,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.42468664050102234,
+ "learning_rate": 0.00011528664314752708,
+ "loss": 0.24437490463256836,
+ "mean_token_accuracy": 0.9198145979642868,
+ "num_tokens": 4800131.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.29811844661831854,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.48722052574157715,
+ "learning_rate": 0.0001104136037788565,
+ "loss": 0.2472528076171875,
+ "mean_token_accuracy": 0.9198214167356491,
+ "num_tokens": 4923382.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.3036586672067642,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.4003150165081024,
+ "learning_rate": 0.00010552822420675757,
+ "loss": 0.2524623489379883,
+ "mean_token_accuracy": 0.9182902538776397,
+ "num_tokens": 5042200.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.30276541873812673,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.5082385540008545,
+ "learning_rate": 0.00010064114172186765,
+ "loss": 0.2554252052307129,
+ "mean_token_accuracy": 0.9163929194211959,
+ "num_tokens": 5168285.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.30480867981910703,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.46462953090667725,
+ "learning_rate": 9.57629973226994e-05,
+ "loss": 0.25483154296875,
+ "mean_token_accuracy": 0.916156811118126,
+ "num_tokens": 5289880.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.2954915864765644,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.5345046520233154,
+ "learning_rate": 9.090441254622432e-05,
+ "loss": 0.24575115203857423,
+ "mean_token_accuracy": 0.9198049437999726,
+ "num_tokens": 5413325.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.37039283126592637,
+ "eval_loss": 0.6287115812301636,
+ "eval_mean_token_accuracy": 0.8435265091061592,
+ "eval_num_tokens": 5522622.0,
+ "eval_runtime": 49.0873,
+ "eval_samples_per_second": 32.575,
+ "eval_steps_per_second": 4.074,
+ "step": 2244
+ },
+ {
+ "entropy": 0.2953245359839815,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.4055909216403961,
+ "learning_rate": 8.607596634083136e-05,
+ "loss": 0.24116868972778321,
+ "mean_token_accuracy": 0.9220171468426482,
+ "num_tokens": 5536511.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.229744790494442,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.4221028983592987,
+ "learning_rate": 8.128817203201665e-05,
+ "loss": 0.1732115364074707,
+ "mean_token_accuracy": 0.9427249735593796,
+ "num_tokens": 5663054.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.23100735485553742,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.5276848673820496,
+ "learning_rate": 7.655145443095877e-05,
+ "loss": 0.1742458724975586,
+ "mean_token_accuracy": 0.9424393928050995,
+ "num_tokens": 5790379.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.2334547135233879,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.5443829298019409,
+ "learning_rate": 7.187612713582257e-05,
+ "loss": 0.17723684310913085,
+ "mean_token_accuracy": 0.9421556174755097,
+ "num_tokens": 5908852.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.22672353580594062,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.4869251251220703,
+ "learning_rate": 6.727237007521524e-05,
+ "loss": 0.17469547271728517,
+ "mean_token_accuracy": 0.9419155931472778,
+ "num_tokens": 6033966.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.23190354615449904,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.5983045697212219,
+ "learning_rate": 6.275020734269083e-05,
+ "loss": 0.17733327865600587,
+ "mean_token_accuracy": 0.9419048410654068,
+ "num_tokens": 6156278.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.23502119958400727,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.5186291337013245,
+ "learning_rate": 5.831948537056545e-05,
+ "loss": 0.18074512481689453,
+ "mean_token_accuracy": 0.9402925485372543,
+ "num_tokens": 6276774.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.23055340006947517,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.48175379633903503,
+ "learning_rate": 5.3989851490567374e-05,
+ "loss": 0.17573400497436523,
+ "mean_token_accuracy": 0.9420478469133378,
+ "num_tokens": 6401444.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.3201144814491272,
+ "eval_loss": 0.7185283899307251,
+ "eval_mean_token_accuracy": 0.8399944826960564,
+ "eval_num_tokens": 6443059.0,
+ "eval_runtime": 49.0868,
+ "eval_samples_per_second": 32.575,
+ "eval_steps_per_second": 4.074,
+ "step": 2618
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.380063552309576e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.0652985372477836,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0f8a9cf31bf0398cf6faaac0d8c73b5273551089
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-2992/trainer_state.json
@@ -0,0 +1,712 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.0,
+ "eval_steps": 500,
+ "global_step": 2992,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8254910862445832,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 0.8958511352539062,
+ "learning_rate": 2.7446846603309888e-05,
+ "loss": 1.722928009033203,
+ "mean_token_accuracy": 0.6557231456041336,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8272530055046081,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6162250638008118,
+ "learning_rate": 5.5453832933217935e-05,
+ "loss": 0.7761137390136719,
+ "mean_token_accuracy": 0.795179500579834,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6629777508974075,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.43394413590431213,
+ "learning_rate": 8.346081926312598e-05,
+ "loss": 0.6302793884277343,
+ "mean_token_accuracy": 0.8262274640798569,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6184763962030411,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.4373406767845154,
+ "learning_rate": 0.00011146780559303404,
+ "loss": 0.5863075256347656,
+ "mean_token_accuracy": 0.8357332807779312,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6116772794723511,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.6781982183456421,
+ "learning_rate": 0.00013947479192294207,
+ "loss": 0.5783074951171875,
+ "mean_token_accuracy": 0.8368694090843201,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5848374783992767,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.3493351340293884,
+ "learning_rate": 0.00016748177825285014,
+ "loss": 0.5531037902832031,
+ "mean_token_accuracy": 0.8434528934955597,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.57043960750103,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.3953551650047302,
+ "learning_rate": 0.00019548876458275817,
+ "loss": 0.5387083053588867,
+ "mean_token_accuracy": 0.844996885061264,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6065685012936592,
+ "eval_loss": 0.5825985074043274,
+ "eval_mean_token_accuracy": 0.8326057174801826,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.017,
+ "eval_samples_per_second": 32.621,
+ "eval_steps_per_second": 4.08,
+ "step": 374
+ },
+ {
+ "entropy": 0.5623676343397661,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.37431156635284424,
+ "learning_rate": 0.00020946374495076317,
+ "loss": 0.526231803894043,
+ "mean_token_accuracy": 0.8500469212580208,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5420066699385643,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3302382826805115,
+ "learning_rate": 0.00020923573570386192,
+ "loss": 0.5088261032104492,
+ "mean_token_accuracy": 0.8518517130613327,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5347033357620239,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.281392902135849,
+ "learning_rate": 0.00020878021367110025,
+ "loss": 0.5044374084472656,
+ "mean_token_accuracy": 0.8548643559217453,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5319472518563271,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.3298404812812805,
+ "learning_rate": 0.00020809817069357935,
+ "loss": 0.5011252593994141,
+ "mean_token_accuracy": 0.8546603417396545,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5234624195098877,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.26496022939682007,
+ "learning_rate": 0.00020719109183285305,
+ "loss": 0.49288436889648435,
+ "mean_token_accuracy": 0.8559961414337158,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5170091751217842,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2679271996021271,
+ "learning_rate": 0.00020606095213739626,
+ "loss": 0.4867116165161133,
+ "mean_token_accuracy": 0.8584025889635086,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.519580851495266,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2783832848072052,
+ "learning_rate": 0.0002047102123421885,
+ "loss": 0.4858899688720703,
+ "mean_token_accuracy": 0.8601382756233216,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5516054129600525,
+ "eval_loss": 0.5468233227729797,
+ "eval_mean_token_accuracy": 0.8418151989579201,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.0401,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 748
+ },
+ {
+ "entropy": 0.5179645954960524,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.39848020672798157,
+ "learning_rate": 0.00020314181351077757,
+ "loss": 0.4828613662719727,
+ "mean_token_accuracy": 0.8594950991447525,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4724735128879547,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.27450788021087646,
+ "learning_rate": 0.00020135917063148916,
+ "loss": 0.43275066375732424,
+ "mean_token_accuracy": 0.8704854369163513,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.4807534040510654,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.3146534562110901,
+ "learning_rate": 0.00019936616518172531,
+ "loss": 0.44326435089111327,
+ "mean_token_accuracy": 0.8670724099874496,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4744683504104614,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.3151724338531494,
+ "learning_rate": 0.0001971671366765428,
+ "loss": 0.44036914825439455,
+ "mean_token_accuracy": 0.8687405550479889,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4728820985555649,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.30318814516067505,
+ "learning_rate": 0.00019476687321991266,
+ "loss": 0.43707496643066407,
+ "mean_token_accuracy": 0.8694235664606095,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4706392896175384,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.32992425560951233,
+ "learning_rate": 0.00019217060107923494,
+ "loss": 0.4379159545898437,
+ "mean_token_accuracy": 0.8695945852994919,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.46944593608379365,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.23659372329711914,
+ "learning_rate": 0.0001893839733058082,
+ "loss": 0.43395462036132815,
+ "mean_token_accuracy": 0.8707112389802932,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.47586817651987073,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.25799325108528137,
+ "learning_rate": 0.00018641305742603172,
+ "loss": 0.44020862579345704,
+ "mean_token_accuracy": 0.869452143907547,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.5092438699305057,
+ "eval_loss": 0.5377861857414246,
+ "eval_mean_token_accuracy": 0.8451382353901863,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.0653,
+ "eval_samples_per_second": 32.589,
+ "eval_steps_per_second": 4.076,
+ "step": 1122
+ },
+ {
+ "entropy": 0.44004572521556506,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.31287693977355957,
+ "learning_rate": 0.0001832643222301409,
+ "loss": 0.39778636932373046,
+ "mean_token_accuracy": 0.8794932232962714,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.42112498462200165,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.3411082327365875,
+ "learning_rate": 0.000179944623687242,
+ "loss": 0.3773982620239258,
+ "mean_token_accuracy": 0.8827895969152451,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.41285495966672897,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.31819090247154236,
+ "learning_rate": 0.0001764611900173143,
+ "loss": 0.3741728210449219,
+ "mean_token_accuracy": 0.8846501314640045,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.42110098838806154,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.3009640872478485,
+ "learning_rate": 0.00017282160595268327,
+ "loss": 0.3816569900512695,
+ "mean_token_accuracy": 0.8814844501018524,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4269171151518822,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.3651696741580963,
+ "learning_rate": 0.00016903379622323396,
+ "loss": 0.38763641357421874,
+ "mean_token_accuracy": 0.8813142603635788,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4262796178460121,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.28190597891807556,
+ "learning_rate": 0.00016510600830132272,
+ "loss": 0.38563640594482423,
+ "mean_token_accuracy": 0.8813980734348297,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4200029063224793,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.2584955096244812,
+ "learning_rate": 0.00016104679444395854,
+ "loss": 0.3829658508300781,
+ "mean_token_accuracy": 0.8822885012626648,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.47917599841952324,
+ "eval_loss": 0.5421923398971558,
+ "eval_mean_token_accuracy": 0.8468858799338341,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.0466,
+ "eval_samples_per_second": 32.602,
+ "eval_steps_per_second": 4.078,
+ "step": 1496
+ },
+ {
+ "entropy": 0.42061784803265273,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.3225401043891907,
+ "learning_rate": 0.0001568649930713548,
+ "loss": 0.38076282501220704,
+ "mean_token_accuracy": 0.8825460594109814,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3564541311562061,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.42701128125190735,
+ "learning_rate": 0.00015256970952239702,
+ "loss": 0.3080678176879883,
+ "mean_token_accuracy": 0.9021131205558777,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3623583456873894,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.7542179822921753,
+ "learning_rate": 0.00014817029622892904,
+ "loss": 0.31919103622436523,
+ "mean_token_accuracy": 0.8978805804252624,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.35793897867202756,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.3683936595916748,
+ "learning_rate": 0.0001436763323520266,
+ "loss": 0.31606245040893555,
+ "mean_token_accuracy": 0.8989632934331894,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.36605307310819624,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.3938419222831726,
+ "learning_rate": 0.00013909760292459586,
+ "loss": 0.3214926528930664,
+ "mean_token_accuracy": 0.897950147986412,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.3702411252260208,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.3159140646457672,
+ "learning_rate": 0.0001344440775457131,
+ "loss": 0.32606857299804687,
+ "mean_token_accuracy": 0.8971680045127869,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.3695961621403694,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.43663182854652405,
+ "learning_rate": 0.00012972588867309488,
+ "loss": 0.324642448425293,
+ "mean_token_accuracy": 0.8974496972560883,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3658722630143166,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3189752995967865,
+ "learning_rate": 0.0001249533095609642,
+ "loss": 0.3198036575317383,
+ "mean_token_accuracy": 0.8985732847452164,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.43428498685359956,
+ "eval_loss": 0.5698739290237427,
+ "eval_mean_token_accuracy": 0.8450208070874214,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.0399,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 1870
+ },
+ {
+ "entropy": 0.3230800422454121,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.38153186440467834,
+ "learning_rate": 0.00012013673189135029,
+ "loss": 0.2727243995666504,
+ "mean_token_accuracy": 0.9120446022110756,
+ "num_tokens": 4679483.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.2979305517673492,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.42468664050102234,
+ "learning_rate": 0.00011528664314752708,
+ "loss": 0.24437490463256836,
+ "mean_token_accuracy": 0.9198145979642868,
+ "num_tokens": 4800131.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.29811844661831854,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.48722052574157715,
+ "learning_rate": 0.0001104136037788565,
+ "loss": 0.2472528076171875,
+ "mean_token_accuracy": 0.9198214167356491,
+ "num_tokens": 4923382.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.3036586672067642,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.4003150165081024,
+ "learning_rate": 0.00010552822420675757,
+ "loss": 0.2524623489379883,
+ "mean_token_accuracy": 0.9182902538776397,
+ "num_tokens": 5042200.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.30276541873812673,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.5082385540008545,
+ "learning_rate": 0.00010064114172186765,
+ "loss": 0.2554252052307129,
+ "mean_token_accuracy": 0.9163929194211959,
+ "num_tokens": 5168285.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.30480867981910703,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.46462953090667725,
+ "learning_rate": 9.57629973226994e-05,
+ "loss": 0.25483154296875,
+ "mean_token_accuracy": 0.916156811118126,
+ "num_tokens": 5289880.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.2954915864765644,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.5345046520233154,
+ "learning_rate": 9.090441254622432e-05,
+ "loss": 0.24575115203857423,
+ "mean_token_accuracy": 0.9198049437999726,
+ "num_tokens": 5413325.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.37039283126592637,
+ "eval_loss": 0.6287115812301636,
+ "eval_mean_token_accuracy": 0.8435265091061592,
+ "eval_num_tokens": 5522622.0,
+ "eval_runtime": 49.0873,
+ "eval_samples_per_second": 32.575,
+ "eval_steps_per_second": 4.074,
+ "step": 2244
+ },
+ {
+ "entropy": 0.2953245359839815,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.4055909216403961,
+ "learning_rate": 8.607596634083136e-05,
+ "loss": 0.24116868972778321,
+ "mean_token_accuracy": 0.9220171468426482,
+ "num_tokens": 5536511.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.229744790494442,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.4221028983592987,
+ "learning_rate": 8.128817203201665e-05,
+ "loss": 0.1732115364074707,
+ "mean_token_accuracy": 0.9427249735593796,
+ "num_tokens": 5663054.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.23100735485553742,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.5276848673820496,
+ "learning_rate": 7.655145443095877e-05,
+ "loss": 0.1742458724975586,
+ "mean_token_accuracy": 0.9424393928050995,
+ "num_tokens": 5790379.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.2334547135233879,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.5443829298019409,
+ "learning_rate": 7.187612713582257e-05,
+ "loss": 0.17723684310913085,
+ "mean_token_accuracy": 0.9421556174755097,
+ "num_tokens": 5908852.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.22672353580594062,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.4869251251220703,
+ "learning_rate": 6.727237007521524e-05,
+ "loss": 0.17469547271728517,
+ "mean_token_accuracy": 0.9419155931472778,
+ "num_tokens": 6033966.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.23190354615449904,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.5983045697212219,
+ "learning_rate": 6.275020734269083e-05,
+ "loss": 0.17733327865600587,
+ "mean_token_accuracy": 0.9419048410654068,
+ "num_tokens": 6156278.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.23502119958400727,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.5186291337013245,
+ "learning_rate": 5.831948537056545e-05,
+ "loss": 0.18074512481689453,
+ "mean_token_accuracy": 0.9402925485372543,
+ "num_tokens": 6276774.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.23055340006947517,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.48175379633903503,
+ "learning_rate": 5.3989851490567374e-05,
+ "loss": 0.17573400497436523,
+ "mean_token_accuracy": 0.9420478469133378,
+ "num_tokens": 6401444.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.3201144814491272,
+ "eval_loss": 0.7185283899307251,
+ "eval_mean_token_accuracy": 0.8399944826960564,
+ "eval_num_tokens": 6443059.0,
+ "eval_runtime": 49.0868,
+ "eval_samples_per_second": 32.575,
+ "eval_steps_per_second": 4.074,
+ "step": 2618
+ },
+ {
+ "entropy": 0.19884085278920452,
+ "epoch": 7.085676037483267,
+ "grad_norm": 0.4999229311943054,
+ "learning_rate": 4.977073292800337e-05,
+ "loss": 0.13776198387145996,
+ "mean_token_accuracy": 0.95504442369095,
+ "num_tokens": 6525142.0,
+ "step": 2650
+ },
+ {
+ "entropy": 0.17382358580827714,
+ "epoch": 7.21954484605087,
+ "grad_norm": 0.44681516289711,
+ "learning_rate": 4.567131627517827e-05,
+ "loss": 0.1151345157623291,
+ "mean_token_accuracy": 0.9624496775865555,
+ "num_tokens": 6651930.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.17993666499853134,
+ "epoch": 7.353413654618474,
+ "grad_norm": 0.47544270753860474,
+ "learning_rate": 4.1700527488762594e-05,
+ "loss": 0.12008686065673828,
+ "mean_token_accuracy": 0.9607802790403366,
+ "num_tokens": 6768469.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.17229609042406083,
+ "epoch": 7.4872824631860775,
+ "grad_norm": 0.4898432791233063,
+ "learning_rate": 3.786701245466089e-05,
+ "loss": 0.1164663314819336,
+ "mean_token_accuracy": 0.9622354304790497,
+ "num_tokens": 6892532.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.16839693702757358,
+ "epoch": 7.621151271753681,
+ "grad_norm": 0.5927155613899231,
+ "learning_rate": 3.417911816269838e-05,
+ "loss": 0.1138334846496582,
+ "mean_token_accuracy": 0.9632772338390351,
+ "num_tokens": 7023373.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.173521406725049,
+ "epoch": 7.755020080321285,
+ "grad_norm": 0.5407077670097351,
+ "learning_rate": 3.0644874532115575e-05,
+ "loss": 0.11670659065246582,
+ "mean_token_accuracy": 0.9622769457101822,
+ "num_tokens": 7146448.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.17213394075632096,
+ "epoch": 7.888888888888889,
+ "grad_norm": 0.48647794127464294,
+ "learning_rate": 2.727197692744389e-05,
+ "loss": 0.11715221405029297,
+ "mean_token_accuracy": 0.9625237709283829,
+ "num_tokens": 7267680.0,
+ "step": 2950
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2757616487890482,
+ "eval_loss": 0.8189995884895325,
+ "eval_mean_token_accuracy": 0.8390460336208343,
+ "eval_num_tokens": 7363496.0,
+ "eval_runtime": 49.0999,
+ "eval_samples_per_second": 32.566,
+ "eval_steps_per_second": 4.073,
+ "step": 2992
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.861809120790405e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.0652985372477836,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %} {#- Tools supplied: open a system turn that lists every tool signature as JSON. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %} {#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %} {#- No separate reasoning field: split inline reasoning off the message text. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %} {#- Reasoning is rendered only for assistant turns after the last user query found above. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} {#- Consecutive tool messages share one user turn. -#}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %} {#- Thinking disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c5337300fd7964cff13080074b6e5a58680c6bba
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3366/trainer_state.json
@@ -0,0 +1,803 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.0,
+ "eval_steps": 500,
+ "global_step": 3366,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8254910862445832,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 0.8958511352539062,
+ "learning_rate": 2.7446846603309888e-05,
+ "loss": 1.722928009033203,
+ "mean_token_accuracy": 0.6557231456041336,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8272530055046081,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6162250638008118,
+ "learning_rate": 5.5453832933217935e-05,
+ "loss": 0.7761137390136719,
+ "mean_token_accuracy": 0.795179500579834,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6629777508974075,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.43394413590431213,
+ "learning_rate": 8.346081926312598e-05,
+ "loss": 0.6302793884277343,
+ "mean_token_accuracy": 0.8262274640798569,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6184763962030411,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.4373406767845154,
+ "learning_rate": 0.00011146780559303404,
+ "loss": 0.5863075256347656,
+ "mean_token_accuracy": 0.8357332807779312,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6116772794723511,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.6781982183456421,
+ "learning_rate": 0.00013947479192294207,
+ "loss": 0.5783074951171875,
+ "mean_token_accuracy": 0.8368694090843201,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5848374783992767,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.3493351340293884,
+ "learning_rate": 0.00016748177825285014,
+ "loss": 0.5531037902832031,
+ "mean_token_accuracy": 0.8434528934955597,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.57043960750103,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.3953551650047302,
+ "learning_rate": 0.00019548876458275817,
+ "loss": 0.5387083053588867,
+ "mean_token_accuracy": 0.844996885061264,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6065685012936592,
+ "eval_loss": 0.5825985074043274,
+ "eval_mean_token_accuracy": 0.8326057174801826,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.017,
+ "eval_samples_per_second": 32.621,
+ "eval_steps_per_second": 4.08,
+ "step": 374
+ },
+ {
+ "entropy": 0.5623676343397661,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.37431156635284424,
+ "learning_rate": 0.00020946374495076317,
+ "loss": 0.526231803894043,
+ "mean_token_accuracy": 0.8500469212580208,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5420066699385643,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3302382826805115,
+ "learning_rate": 0.00020923573570386192,
+ "loss": 0.5088261032104492,
+ "mean_token_accuracy": 0.8518517130613327,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5347033357620239,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.281392902135849,
+ "learning_rate": 0.00020878021367110025,
+ "loss": 0.5044374084472656,
+ "mean_token_accuracy": 0.8548643559217453,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5319472518563271,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.3298404812812805,
+ "learning_rate": 0.00020809817069357935,
+ "loss": 0.5011252593994141,
+ "mean_token_accuracy": 0.8546603417396545,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5234624195098877,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.26496022939682007,
+ "learning_rate": 0.00020719109183285305,
+ "loss": 0.49288436889648435,
+ "mean_token_accuracy": 0.8559961414337158,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5170091751217842,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2679271996021271,
+ "learning_rate": 0.00020606095213739626,
+ "loss": 0.4867116165161133,
+ "mean_token_accuracy": 0.8584025889635086,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.519580851495266,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2783832848072052,
+ "learning_rate": 0.0002047102123421885,
+ "loss": 0.4858899688720703,
+ "mean_token_accuracy": 0.8601382756233216,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5516054129600525,
+ "eval_loss": 0.5468233227729797,
+ "eval_mean_token_accuracy": 0.8418151989579201,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.0401,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 748
+ },
+ {
+ "entropy": 0.5179645954960524,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.39848020672798157,
+ "learning_rate": 0.00020314181351077757,
+ "loss": 0.4828613662719727,
+ "mean_token_accuracy": 0.8594950991447525,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4724735128879547,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.27450788021087646,
+ "learning_rate": 0.00020135917063148916,
+ "loss": 0.43275066375732424,
+ "mean_token_accuracy": 0.8704854369163513,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.4807534040510654,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.3146534562110901,
+ "learning_rate": 0.00019936616518172531,
+ "loss": 0.44326435089111327,
+ "mean_token_accuracy": 0.8670724099874496,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4744683504104614,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.3151724338531494,
+ "learning_rate": 0.0001971671366765428,
+ "loss": 0.44036914825439455,
+ "mean_token_accuracy": 0.8687405550479889,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4728820985555649,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.30318814516067505,
+ "learning_rate": 0.00019476687321991266,
+ "loss": 0.43707496643066407,
+ "mean_token_accuracy": 0.8694235664606095,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4706392896175384,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.32992425560951233,
+ "learning_rate": 0.00019217060107923494,
+ "loss": 0.4379159545898437,
+ "mean_token_accuracy": 0.8695945852994919,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.46944593608379365,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.23659372329711914,
+ "learning_rate": 0.0001893839733058082,
+ "loss": 0.43395462036132815,
+ "mean_token_accuracy": 0.8707112389802932,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.47586817651987073,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.25799325108528137,
+ "learning_rate": 0.00018641305742603172,
+ "loss": 0.44020862579345704,
+ "mean_token_accuracy": 0.869452143907547,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.5092438699305057,
+ "eval_loss": 0.5377861857414246,
+ "eval_mean_token_accuracy": 0.8451382353901863,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.0653,
+ "eval_samples_per_second": 32.589,
+ "eval_steps_per_second": 4.076,
+ "step": 1122
+ },
+ {
+ "entropy": 0.44004572521556506,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.31287693977355957,
+ "learning_rate": 0.0001832643222301409,
+ "loss": 0.39778636932373046,
+ "mean_token_accuracy": 0.8794932232962714,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.42112498462200165,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.3411082327365875,
+ "learning_rate": 0.000179944623687242,
+ "loss": 0.3773982620239258,
+ "mean_token_accuracy": 0.8827895969152451,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.41285495966672897,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.31819090247154236,
+ "learning_rate": 0.0001764611900173143,
+ "loss": 0.3741728210449219,
+ "mean_token_accuracy": 0.8846501314640045,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.42110098838806154,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.3009640872478485,
+ "learning_rate": 0.00017282160595268327,
+ "loss": 0.3816569900512695,
+ "mean_token_accuracy": 0.8814844501018524,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4269171151518822,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.3651696741580963,
+ "learning_rate": 0.00016903379622323396,
+ "loss": 0.38763641357421874,
+ "mean_token_accuracy": 0.8813142603635788,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4262796178460121,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.28190597891807556,
+ "learning_rate": 0.00016510600830132272,
+ "loss": 0.38563640594482423,
+ "mean_token_accuracy": 0.8813980734348297,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4200029063224793,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.2584955096244812,
+ "learning_rate": 0.00016104679444395854,
+ "loss": 0.3829658508300781,
+ "mean_token_accuracy": 0.8822885012626648,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.47917599841952324,
+ "eval_loss": 0.5421923398971558,
+ "eval_mean_token_accuracy": 0.8468858799338341,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.0466,
+ "eval_samples_per_second": 32.602,
+ "eval_steps_per_second": 4.078,
+ "step": 1496
+ },
+ {
+ "entropy": 0.42061784803265273,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.3225401043891907,
+ "learning_rate": 0.0001568649930713548,
+ "loss": 0.38076282501220704,
+ "mean_token_accuracy": 0.8825460594109814,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3564541311562061,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.42701128125190735,
+ "learning_rate": 0.00015256970952239702,
+ "loss": 0.3080678176879883,
+ "mean_token_accuracy": 0.9021131205558777,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3623583456873894,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.7542179822921753,
+ "learning_rate": 0.00014817029622892904,
+ "loss": 0.31919103622436523,
+ "mean_token_accuracy": 0.8978805804252624,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.35793897867202756,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.3683936595916748,
+ "learning_rate": 0.0001436763323520266,
+ "loss": 0.31606245040893555,
+ "mean_token_accuracy": 0.8989632934331894,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.36605307310819624,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.3938419222831726,
+ "learning_rate": 0.00013909760292459586,
+ "loss": 0.3214926528930664,
+ "mean_token_accuracy": 0.897950147986412,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.3702411252260208,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.3159140646457672,
+ "learning_rate": 0.0001344440775457131,
+ "loss": 0.32606857299804687,
+ "mean_token_accuracy": 0.8971680045127869,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.3695961621403694,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.43663182854652405,
+ "learning_rate": 0.00012972588867309488,
+ "loss": 0.324642448425293,
+ "mean_token_accuracy": 0.8974496972560883,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3658722630143166,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3189752995967865,
+ "learning_rate": 0.0001249533095609642,
+ "loss": 0.3198036575317383,
+ "mean_token_accuracy": 0.8985732847452164,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.43428498685359956,
+ "eval_loss": 0.5698739290237427,
+ "eval_mean_token_accuracy": 0.8450208070874214,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.0399,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 1870
+ },
+ {
+ "entropy": 0.3230800422454121,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.38153186440467834,
+ "learning_rate": 0.00012013673189135029,
+ "loss": 0.2727243995666504,
+ "mean_token_accuracy": 0.9120446022110756,
+ "num_tokens": 4679483.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.2979305517673492,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.42468664050102234,
+ "learning_rate": 0.00011528664314752708,
+ "loss": 0.24437490463256836,
+ "mean_token_accuracy": 0.9198145979642868,
+ "num_tokens": 4800131.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.29811844661831854,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.48722052574157715,
+ "learning_rate": 0.0001104136037788565,
+ "loss": 0.2472528076171875,
+ "mean_token_accuracy": 0.9198214167356491,
+ "num_tokens": 4923382.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.3036586672067642,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.4003150165081024,
+ "learning_rate": 0.00010552822420675757,
+ "loss": 0.2524623489379883,
+ "mean_token_accuracy": 0.9182902538776397,
+ "num_tokens": 5042200.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.30276541873812673,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.5082385540008545,
+ "learning_rate": 0.00010064114172186765,
+ "loss": 0.2554252052307129,
+ "mean_token_accuracy": 0.9163929194211959,
+ "num_tokens": 5168285.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.30480867981910703,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.46462953090667725,
+ "learning_rate": 9.57629973226994e-05,
+ "loss": 0.25483154296875,
+ "mean_token_accuracy": 0.916156811118126,
+ "num_tokens": 5289880.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.2954915864765644,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.5345046520233154,
+ "learning_rate": 9.090441254622432e-05,
+ "loss": 0.24575115203857423,
+ "mean_token_accuracy": 0.9198049437999726,
+ "num_tokens": 5413325.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.37039283126592637,
+ "eval_loss": 0.6287115812301636,
+ "eval_mean_token_accuracy": 0.8435265091061592,
+ "eval_num_tokens": 5522622.0,
+ "eval_runtime": 49.0873,
+ "eval_samples_per_second": 32.575,
+ "eval_steps_per_second": 4.074,
+ "step": 2244
+ },
+ {
+ "entropy": 0.2953245359839815,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.4055909216403961,
+ "learning_rate": 8.607596634083136e-05,
+ "loss": 0.24116868972778321,
+ "mean_token_accuracy": 0.9220171468426482,
+ "num_tokens": 5536511.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.229744790494442,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.4221028983592987,
+ "learning_rate": 8.128817203201665e-05,
+ "loss": 0.1732115364074707,
+ "mean_token_accuracy": 0.9427249735593796,
+ "num_tokens": 5663054.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.23100735485553742,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.5276848673820496,
+ "learning_rate": 7.655145443095877e-05,
+ "loss": 0.1742458724975586,
+ "mean_token_accuracy": 0.9424393928050995,
+ "num_tokens": 5790379.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.2334547135233879,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.5443829298019409,
+ "learning_rate": 7.187612713582257e-05,
+ "loss": 0.17723684310913085,
+ "mean_token_accuracy": 0.9421556174755097,
+ "num_tokens": 5908852.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.22672353580594062,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.4869251251220703,
+ "learning_rate": 6.727237007521524e-05,
+ "loss": 0.17469547271728517,
+ "mean_token_accuracy": 0.9419155931472778,
+ "num_tokens": 6033966.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.23190354615449904,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.5983045697212219,
+ "learning_rate": 6.275020734269083e-05,
+ "loss": 0.17733327865600587,
+ "mean_token_accuracy": 0.9419048410654068,
+ "num_tokens": 6156278.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.23502119958400727,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.5186291337013245,
+ "learning_rate": 5.831948537056545e-05,
+ "loss": 0.18074512481689453,
+ "mean_token_accuracy": 0.9402925485372543,
+ "num_tokens": 6276774.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.23055340006947517,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.48175379633903503,
+ "learning_rate": 5.3989851490567374e-05,
+ "loss": 0.17573400497436523,
+ "mean_token_accuracy": 0.9420478469133378,
+ "num_tokens": 6401444.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.3201144814491272,
+ "eval_loss": 0.7185283899307251,
+ "eval_mean_token_accuracy": 0.8399944826960564,
+ "eval_num_tokens": 6443059.0,
+ "eval_runtime": 49.0868,
+ "eval_samples_per_second": 32.575,
+ "eval_steps_per_second": 4.074,
+ "step": 2618
+ },
+ {
+ "entropy": 0.19884085278920452,
+ "epoch": 7.085676037483267,
+ "grad_norm": 0.4999229311943054,
+ "learning_rate": 4.977073292800337e-05,
+ "loss": 0.13776198387145996,
+ "mean_token_accuracy": 0.95504442369095,
+ "num_tokens": 6525142.0,
+ "step": 2650
+ },
+ {
+ "entropy": 0.17382358580827714,
+ "epoch": 7.21954484605087,
+ "grad_norm": 0.44681516289711,
+ "learning_rate": 4.567131627517827e-05,
+ "loss": 0.1151345157623291,
+ "mean_token_accuracy": 0.9624496775865555,
+ "num_tokens": 6651930.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.17993666499853134,
+ "epoch": 7.353413654618474,
+ "grad_norm": 0.47544270753860474,
+ "learning_rate": 4.1700527488762594e-05,
+ "loss": 0.12008686065673828,
+ "mean_token_accuracy": 0.9607802790403366,
+ "num_tokens": 6768469.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.17229609042406083,
+ "epoch": 7.4872824631860775,
+ "grad_norm": 0.4898432791233063,
+ "learning_rate": 3.786701245466089e-05,
+ "loss": 0.1164663314819336,
+ "mean_token_accuracy": 0.9622354304790497,
+ "num_tokens": 6892532.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.16839693702757358,
+ "epoch": 7.621151271753681,
+ "grad_norm": 0.5927155613899231,
+ "learning_rate": 3.417911816269838e-05,
+ "loss": 0.1138334846496582,
+ "mean_token_accuracy": 0.9632772338390351,
+ "num_tokens": 7023373.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.173521406725049,
+ "epoch": 7.755020080321285,
+ "grad_norm": 0.5407077670097351,
+ "learning_rate": 3.0644874532115575e-05,
+ "loss": 0.11670659065246582,
+ "mean_token_accuracy": 0.9622769457101822,
+ "num_tokens": 7146448.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.17213394075632096,
+ "epoch": 7.888888888888889,
+ "grad_norm": 0.48647794127464294,
+ "learning_rate": 2.727197692744389e-05,
+ "loss": 0.11715221405029297,
+ "mean_token_accuracy": 0.9625237709283829,
+ "num_tokens": 7267680.0,
+ "step": 2950
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2757616487890482,
+ "eval_loss": 0.8189995884895325,
+ "eval_mean_token_accuracy": 0.8390460336208343,
+ "eval_num_tokens": 7363496.0,
+ "eval_runtime": 49.0999,
+ "eval_samples_per_second": 32.566,
+ "eval_steps_per_second": 4.073,
+ "step": 2992
+ },
+ {
+ "entropy": 0.17219420319253748,
+ "epoch": 8.021419009370817,
+ "grad_norm": 0.37176114320755005,
+ "learning_rate": 2.406776940283137e-05,
+ "loss": 0.11532307624816894,
+ "mean_token_accuracy": 0.9632835845754604,
+ "num_tokens": 7384150.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.14740404956042766,
+ "epoch": 8.15528781793842,
+ "grad_norm": 0.3686515688896179,
+ "learning_rate": 2.10392287113017e-05,
+ "loss": 0.08609914779663086,
+ "mean_token_accuracy": 0.9740558165311813,
+ "num_tokens": 7502454.0,
+ "step": 3050
+ },
+ {
+ "entropy": 0.14145817942917346,
+ "epoch": 8.289156626506024,
+ "grad_norm": 0.3443886339664459,
+ "learning_rate": 1.8192949113764877e-05,
+ "loss": 0.081221923828125,
+ "mean_token_accuracy": 0.9744218772649765,
+ "num_tokens": 7628419.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.13880868263542653,
+ "epoch": 8.423025435073628,
+ "grad_norm": 0.4160684049129486,
+ "learning_rate": 1.5535128020855533e-05,
+ "loss": 0.0840027904510498,
+ "mean_token_accuracy": 0.9742409408092498,
+ "num_tokens": 7751912.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.13735023334622384,
+ "epoch": 8.556894243641231,
+ "grad_norm": 0.5086039900779724,
+ "learning_rate": 1.3071552498861985e-05,
+ "loss": 0.08229084014892578,
+ "mean_token_accuracy": 0.9739874929189682,
+ "num_tokens": 7877804.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.1375646834075451,
+ "epoch": 8.690763052208835,
+ "grad_norm": 0.32038062810897827,
+ "learning_rate": 1.0807586669127857e-05,
+ "loss": 0.08256589889526367,
+ "mean_token_accuracy": 0.9740087121725083,
+ "num_tokens": 8003296.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.14328225292265415,
+ "epoch": 8.824631860776439,
+ "grad_norm": 0.34184616804122925,
+ "learning_rate": 8.748160028362413e-06,
+ "loss": 0.08445584297180175,
+ "mean_token_accuracy": 0.9736961781978607,
+ "num_tokens": 8123859.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.14006814867258072,
+ "epoch": 8.958500669344042,
+ "grad_norm": 0.610028088092804,
+ "learning_rate": 6.897756715290319e-06,
+ "loss": 0.08359557151794433,
+ "mean_token_accuracy": 0.9739799553155899,
+ "num_tokens": 8246971.0,
+ "step": 3350
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.2503813248872757,
+ "eval_loss": 0.9325668215751648,
+ "eval_mean_token_accuracy": 0.8365286010503769,
+ "eval_num_tokens": 8283933.0,
+ "eval_runtime": 49.0602,
+ "eval_samples_per_second": 32.593,
+ "eval_steps_per_second": 4.077,
+ "step": 3366
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.344062788520479e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.0652985372477836,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5366ce377642689b236590ff16365897360188f2
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-374/trainer_state.json
@@ -0,0 +1,115 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 500,
+ "global_step": 374,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8254910862445832,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 0.8958511352539062,
+ "learning_rate": 2.7446846603309888e-05,
+ "loss": 1.722928009033203,
+ "mean_token_accuracy": 0.6557231456041336,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8272530055046081,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6162250638008118,
+ "learning_rate": 5.5453832933217935e-05,
+ "loss": 0.7761137390136719,
+ "mean_token_accuracy": 0.795179500579834,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6629777508974075,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.43394413590431213,
+ "learning_rate": 8.346081926312598e-05,
+ "loss": 0.6302793884277343,
+ "mean_token_accuracy": 0.8262274640798569,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6184763962030411,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.4373406767845154,
+ "learning_rate": 0.00011146780559303404,
+ "loss": 0.5863075256347656,
+ "mean_token_accuracy": 0.8357332807779312,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6116772794723511,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.6781982183456421,
+ "learning_rate": 0.00013947479192294207,
+ "loss": 0.5783074951171875,
+ "mean_token_accuracy": 0.8368694090843201,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5848374783992767,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.3493351340293884,
+ "learning_rate": 0.00016748177825285014,
+ "loss": 0.5531037902832031,
+ "mean_token_accuracy": 0.8434528934955597,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.57043960750103,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.3953551650047302,
+ "learning_rate": 0.00019548876458275817,
+ "loss": 0.5387083053588867,
+ "mean_token_accuracy": 0.844996885061264,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6065685012936592,
+ "eval_loss": 0.5825985074043274,
+ "eval_mean_token_accuracy": 0.8326057174801826,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.017,
+ "eval_samples_per_second": 32.621,
+ "eval_steps_per_second": 4.08,
+ "step": 374
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.800445791636787e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.0652985372477836,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Chat template: renders `messages` (and optional `tools`) as <|im_start|>role ... <|im_end|> turns. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response; its index lands in ns.last_query_index. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- No explicit reasoning_content: split it out of an inline think block in the content. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is re-emitted only for assistant turns after the last real user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking explicitly disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e0b077016f64f0c0638281a1f528441069be96a3
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-3740/trainer_state.json
@@ -0,0 +1,884 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 10.0,
+ "eval_steps": 500,
+ "global_step": 3740,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8254910862445832,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 0.8958511352539062,
+ "learning_rate": 2.7446846603309888e-05,
+ "loss": 1.722928009033203,
+ "mean_token_accuracy": 0.6557231456041336,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8272530055046081,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6162250638008118,
+ "learning_rate": 5.5453832933217935e-05,
+ "loss": 0.7761137390136719,
+ "mean_token_accuracy": 0.795179500579834,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6629777508974075,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.43394413590431213,
+ "learning_rate": 8.346081926312598e-05,
+ "loss": 0.6302793884277343,
+ "mean_token_accuracy": 0.8262274640798569,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6184763962030411,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.4373406767845154,
+ "learning_rate": 0.00011146780559303404,
+ "loss": 0.5863075256347656,
+ "mean_token_accuracy": 0.8357332807779312,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6116772794723511,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.6781982183456421,
+ "learning_rate": 0.00013947479192294207,
+ "loss": 0.5783074951171875,
+ "mean_token_accuracy": 0.8368694090843201,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5848374783992767,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.3493351340293884,
+ "learning_rate": 0.00016748177825285014,
+ "loss": 0.5531037902832031,
+ "mean_token_accuracy": 0.8434528934955597,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.57043960750103,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.3953551650047302,
+ "learning_rate": 0.00019548876458275817,
+ "loss": 0.5387083053588867,
+ "mean_token_accuracy": 0.844996885061264,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6065685012936592,
+ "eval_loss": 0.5825985074043274,
+ "eval_mean_token_accuracy": 0.8326057174801826,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.017,
+ "eval_samples_per_second": 32.621,
+ "eval_steps_per_second": 4.08,
+ "step": 374
+ },
+ {
+ "entropy": 0.5623676343397661,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.37431156635284424,
+ "learning_rate": 0.00020946374495076317,
+ "loss": 0.526231803894043,
+ "mean_token_accuracy": 0.8500469212580208,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5420066699385643,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3302382826805115,
+ "learning_rate": 0.00020923573570386192,
+ "loss": 0.5088261032104492,
+ "mean_token_accuracy": 0.8518517130613327,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5347033357620239,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.281392902135849,
+ "learning_rate": 0.00020878021367110025,
+ "loss": 0.5044374084472656,
+ "mean_token_accuracy": 0.8548643559217453,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5319472518563271,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.3298404812812805,
+ "learning_rate": 0.00020809817069357935,
+ "loss": 0.5011252593994141,
+ "mean_token_accuracy": 0.8546603417396545,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5234624195098877,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.26496022939682007,
+ "learning_rate": 0.00020719109183285305,
+ "loss": 0.49288436889648435,
+ "mean_token_accuracy": 0.8559961414337158,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5170091751217842,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2679271996021271,
+ "learning_rate": 0.00020606095213739626,
+ "loss": 0.4867116165161133,
+ "mean_token_accuracy": 0.8584025889635086,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.519580851495266,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2783832848072052,
+ "learning_rate": 0.0002047102123421885,
+ "loss": 0.4858899688720703,
+ "mean_token_accuracy": 0.8601382756233216,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5516054129600525,
+ "eval_loss": 0.5468233227729797,
+ "eval_mean_token_accuracy": 0.8418151989579201,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.0401,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 748
+ },
+ {
+ "entropy": 0.5179645954960524,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.39848020672798157,
+ "learning_rate": 0.00020314181351077757,
+ "loss": 0.4828613662719727,
+ "mean_token_accuracy": 0.8594950991447525,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.4724735128879547,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.27450788021087646,
+ "learning_rate": 0.00020135917063148916,
+ "loss": 0.43275066375732424,
+ "mean_token_accuracy": 0.8704854369163513,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.4807534040510654,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.3146534562110901,
+ "learning_rate": 0.00019936616518172531,
+ "loss": 0.44326435089111327,
+ "mean_token_accuracy": 0.8670724099874496,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.4744683504104614,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.3151724338531494,
+ "learning_rate": 0.0001971671366765428,
+ "loss": 0.44036914825439455,
+ "mean_token_accuracy": 0.8687405550479889,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4728820985555649,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.30318814516067505,
+ "learning_rate": 0.00019476687321991266,
+ "loss": 0.43707496643066407,
+ "mean_token_accuracy": 0.8694235664606095,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.4706392896175384,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.32992425560951233,
+ "learning_rate": 0.00019217060107923494,
+ "loss": 0.4379159545898437,
+ "mean_token_accuracy": 0.8695945852994919,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.46944593608379365,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.23659372329711914,
+ "learning_rate": 0.0001893839733058082,
+ "loss": 0.43395462036132815,
+ "mean_token_accuracy": 0.8707112389802932,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.47586817651987073,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.25799325108528137,
+ "learning_rate": 0.00018641305742603172,
+ "loss": 0.44020862579345704,
+ "mean_token_accuracy": 0.869452143907547,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.5092438699305057,
+ "eval_loss": 0.5377861857414246,
+ "eval_mean_token_accuracy": 0.8451382353901863,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.0653,
+ "eval_samples_per_second": 32.589,
+ "eval_steps_per_second": 4.076,
+ "step": 1122
+ },
+ {
+ "entropy": 0.44004572521556506,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.31287693977355957,
+ "learning_rate": 0.0001832643222301409,
+ "loss": 0.39778636932373046,
+ "mean_token_accuracy": 0.8794932232962714,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.42112498462200165,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.3411082327365875,
+ "learning_rate": 0.000179944623687242,
+ "loss": 0.3773982620239258,
+ "mean_token_accuracy": 0.8827895969152451,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.41285495966672897,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.31819090247154236,
+ "learning_rate": 0.0001764611900173143,
+ "loss": 0.3741728210449219,
+ "mean_token_accuracy": 0.8846501314640045,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.42110098838806154,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.3009640872478485,
+ "learning_rate": 0.00017282160595268327,
+ "loss": 0.3816569900512695,
+ "mean_token_accuracy": 0.8814844501018524,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4269171151518822,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.3651696741580963,
+ "learning_rate": 0.00016903379622323396,
+ "loss": 0.38763641357421874,
+ "mean_token_accuracy": 0.8813142603635788,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.4262796178460121,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.28190597891807556,
+ "learning_rate": 0.00016510600830132272,
+ "loss": 0.38563640594482423,
+ "mean_token_accuracy": 0.8813980734348297,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.4200029063224793,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.2584955096244812,
+ "learning_rate": 0.00016104679444395854,
+ "loss": 0.3829658508300781,
+ "mean_token_accuracy": 0.8822885012626648,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.47917599841952324,
+ "eval_loss": 0.5421923398971558,
+ "eval_mean_token_accuracy": 0.8468858799338341,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.0466,
+ "eval_samples_per_second": 32.602,
+ "eval_steps_per_second": 4.078,
+ "step": 1496
+ },
+ {
+ "entropy": 0.42061784803265273,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.3225401043891907,
+ "learning_rate": 0.0001568649930713548,
+ "loss": 0.38076282501220704,
+ "mean_token_accuracy": 0.8825460594109814,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.3564541311562061,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.42701128125190735,
+ "learning_rate": 0.00015256970952239702,
+ "loss": 0.3080678176879883,
+ "mean_token_accuracy": 0.9021131205558777,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.3623583456873894,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.7542179822921753,
+ "learning_rate": 0.00014817029622892904,
+ "loss": 0.31919103622436523,
+ "mean_token_accuracy": 0.8978805804252624,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.35793897867202756,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.3683936595916748,
+ "learning_rate": 0.0001436763323520266,
+ "loss": 0.31606245040893555,
+ "mean_token_accuracy": 0.8989632934331894,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.36605307310819624,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.3938419222831726,
+ "learning_rate": 0.00013909760292459586,
+ "loss": 0.3214926528930664,
+ "mean_token_accuracy": 0.897950147986412,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.3702411252260208,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.3159140646457672,
+ "learning_rate": 0.0001344440775457131,
+ "loss": 0.32606857299804687,
+ "mean_token_accuracy": 0.8971680045127869,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.3695961621403694,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.43663182854652405,
+ "learning_rate": 0.00012972588867309488,
+ "loss": 0.324642448425293,
+ "mean_token_accuracy": 0.8974496972560883,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.3658722630143166,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.3189752995967865,
+ "learning_rate": 0.0001249533095609642,
+ "loss": 0.3198036575317383,
+ "mean_token_accuracy": 0.8985732847452164,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.43428498685359956,
+ "eval_loss": 0.5698739290237427,
+ "eval_mean_token_accuracy": 0.8450208070874214,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.0399,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 1870
+ },
+ {
+ "entropy": 0.3230800422454121,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.38153186440467834,
+ "learning_rate": 0.00012013673189135029,
+ "loss": 0.2727243995666504,
+ "mean_token_accuracy": 0.9120446022110756,
+ "num_tokens": 4679483.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.2979305517673492,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.42468664050102234,
+ "learning_rate": 0.00011528664314752708,
+ "loss": 0.24437490463256836,
+ "mean_token_accuracy": 0.9198145979642868,
+ "num_tokens": 4800131.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.29811844661831854,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.48722052574157715,
+ "learning_rate": 0.0001104136037788565,
+ "loss": 0.2472528076171875,
+ "mean_token_accuracy": 0.9198214167356491,
+ "num_tokens": 4923382.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.3036586672067642,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.4003150165081024,
+ "learning_rate": 0.00010552822420675757,
+ "loss": 0.2524623489379883,
+ "mean_token_accuracy": 0.9182902538776397,
+ "num_tokens": 5042200.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.30276541873812673,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.5082385540008545,
+ "learning_rate": 0.00010064114172186765,
+ "loss": 0.2554252052307129,
+ "mean_token_accuracy": 0.9163929194211959,
+ "num_tokens": 5168285.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.30480867981910703,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.46462953090667725,
+ "learning_rate": 9.57629973226994e-05,
+ "loss": 0.25483154296875,
+ "mean_token_accuracy": 0.916156811118126,
+ "num_tokens": 5289880.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.2954915864765644,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.5345046520233154,
+ "learning_rate": 9.090441254622432e-05,
+ "loss": 0.24575115203857423,
+ "mean_token_accuracy": 0.9198049437999726,
+ "num_tokens": 5413325.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.37039283126592637,
+ "eval_loss": 0.6287115812301636,
+ "eval_mean_token_accuracy": 0.8435265091061592,
+ "eval_num_tokens": 5522622.0,
+ "eval_runtime": 49.0873,
+ "eval_samples_per_second": 32.575,
+ "eval_steps_per_second": 4.074,
+ "step": 2244
+ },
+ {
+ "entropy": 0.2953245359839815,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.4055909216403961,
+ "learning_rate": 8.607596634083136e-05,
+ "loss": 0.24116868972778321,
+ "mean_token_accuracy": 0.9220171468426482,
+ "num_tokens": 5536511.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.229744790494442,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.4221028983592987,
+ "learning_rate": 8.128817203201665e-05,
+ "loss": 0.1732115364074707,
+ "mean_token_accuracy": 0.9427249735593796,
+ "num_tokens": 5663054.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.23100735485553742,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.5276848673820496,
+ "learning_rate": 7.655145443095877e-05,
+ "loss": 0.1742458724975586,
+ "mean_token_accuracy": 0.9424393928050995,
+ "num_tokens": 5790379.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.2334547135233879,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.5443829298019409,
+ "learning_rate": 7.187612713582257e-05,
+ "loss": 0.17723684310913085,
+ "mean_token_accuracy": 0.9421556174755097,
+ "num_tokens": 5908852.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.22672353580594062,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.4869251251220703,
+ "learning_rate": 6.727237007521524e-05,
+ "loss": 0.17469547271728517,
+ "mean_token_accuracy": 0.9419155931472778,
+ "num_tokens": 6033966.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.23190354615449904,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.5983045697212219,
+ "learning_rate": 6.275020734269083e-05,
+ "loss": 0.17733327865600587,
+ "mean_token_accuracy": 0.9419048410654068,
+ "num_tokens": 6156278.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.23502119958400727,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.5186291337013245,
+ "learning_rate": 5.831948537056545e-05,
+ "loss": 0.18074512481689453,
+ "mean_token_accuracy": 0.9402925485372543,
+ "num_tokens": 6276774.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.23055340006947517,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.48175379633903503,
+ "learning_rate": 5.3989851490567374e-05,
+ "loss": 0.17573400497436523,
+ "mean_token_accuracy": 0.9420478469133378,
+ "num_tokens": 6401444.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.3201144814491272,
+ "eval_loss": 0.7185283899307251,
+ "eval_mean_token_accuracy": 0.8399944826960564,
+ "eval_num_tokens": 6443059.0,
+ "eval_runtime": 49.0868,
+ "eval_samples_per_second": 32.575,
+ "eval_steps_per_second": 4.074,
+ "step": 2618
+ },
+ {
+ "entropy": 0.19884085278920452,
+ "epoch": 7.085676037483267,
+ "grad_norm": 0.4999229311943054,
+ "learning_rate": 4.977073292800337e-05,
+ "loss": 0.13776198387145996,
+ "mean_token_accuracy": 0.95504442369095,
+ "num_tokens": 6525142.0,
+ "step": 2650
+ },
+ {
+ "entropy": 0.17382358580827714,
+ "epoch": 7.21954484605087,
+ "grad_norm": 0.44681516289711,
+ "learning_rate": 4.567131627517827e-05,
+ "loss": 0.1151345157623291,
+ "mean_token_accuracy": 0.9624496775865555,
+ "num_tokens": 6651930.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.17993666499853134,
+ "epoch": 7.353413654618474,
+ "grad_norm": 0.47544270753860474,
+ "learning_rate": 4.1700527488762594e-05,
+ "loss": 0.12008686065673828,
+ "mean_token_accuracy": 0.9607802790403366,
+ "num_tokens": 6768469.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.17229609042406083,
+ "epoch": 7.4872824631860775,
+ "grad_norm": 0.4898432791233063,
+ "learning_rate": 3.786701245466089e-05,
+ "loss": 0.1164663314819336,
+ "mean_token_accuracy": 0.9622354304790497,
+ "num_tokens": 6892532.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.16839693702757358,
+ "epoch": 7.621151271753681,
+ "grad_norm": 0.5927155613899231,
+ "learning_rate": 3.417911816269838e-05,
+ "loss": 0.1138334846496582,
+ "mean_token_accuracy": 0.9632772338390351,
+ "num_tokens": 7023373.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.173521406725049,
+ "epoch": 7.755020080321285,
+ "grad_norm": 0.5407077670097351,
+ "learning_rate": 3.0644874532115575e-05,
+ "loss": 0.11670659065246582,
+ "mean_token_accuracy": 0.9622769457101822,
+ "num_tokens": 7146448.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.17213394075632096,
+ "epoch": 7.888888888888889,
+ "grad_norm": 0.48647794127464294,
+ "learning_rate": 2.727197692744389e-05,
+ "loss": 0.11715221405029297,
+ "mean_token_accuracy": 0.9625237709283829,
+ "num_tokens": 7267680.0,
+ "step": 2950
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2757616487890482,
+ "eval_loss": 0.8189995884895325,
+ "eval_mean_token_accuracy": 0.8390460336208343,
+ "eval_num_tokens": 7363496.0,
+ "eval_runtime": 49.0999,
+ "eval_samples_per_second": 32.566,
+ "eval_steps_per_second": 4.073,
+ "step": 2992
+ },
+ {
+ "entropy": 0.17219420319253748,
+ "epoch": 8.021419009370817,
+ "grad_norm": 0.37176114320755005,
+ "learning_rate": 2.406776940283137e-05,
+ "loss": 0.11532307624816894,
+ "mean_token_accuracy": 0.9632835845754604,
+ "num_tokens": 7384150.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.14740404956042766,
+ "epoch": 8.15528781793842,
+ "grad_norm": 0.3686515688896179,
+ "learning_rate": 2.10392287113017e-05,
+ "loss": 0.08609914779663086,
+ "mean_token_accuracy": 0.9740558165311813,
+ "num_tokens": 7502454.0,
+ "step": 3050
+ },
+ {
+ "entropy": 0.14145817942917346,
+ "epoch": 8.289156626506024,
+ "grad_norm": 0.3443886339664459,
+ "learning_rate": 1.8192949113764877e-05,
+ "loss": 0.081221923828125,
+ "mean_token_accuracy": 0.9744218772649765,
+ "num_tokens": 7628419.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.13880868263542653,
+ "epoch": 8.423025435073628,
+ "grad_norm": 0.4160684049129486,
+ "learning_rate": 1.5535128020855533e-05,
+ "loss": 0.0840027904510498,
+ "mean_token_accuracy": 0.9742409408092498,
+ "num_tokens": 7751912.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.13735023334622384,
+ "epoch": 8.556894243641231,
+ "grad_norm": 0.5086039900779724,
+ "learning_rate": 1.3071552498861985e-05,
+ "loss": 0.08229084014892578,
+ "mean_token_accuracy": 0.9739874929189682,
+ "num_tokens": 7877804.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.1375646834075451,
+ "epoch": 8.690763052208835,
+ "grad_norm": 0.32038062810897827,
+ "learning_rate": 1.0807586669127857e-05,
+ "loss": 0.08256589889526367,
+ "mean_token_accuracy": 0.9740087121725083,
+ "num_tokens": 8003296.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.14328225292265415,
+ "epoch": 8.824631860776439,
+ "grad_norm": 0.34184616804122925,
+ "learning_rate": 8.748160028362413e-06,
+ "loss": 0.08445584297180175,
+ "mean_token_accuracy": 0.9736961781978607,
+ "num_tokens": 8123859.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.14006814867258072,
+ "epoch": 8.958500669344042,
+ "grad_norm": 0.610028088092804,
+ "learning_rate": 6.897756715290319e-06,
+ "loss": 0.08359557151794433,
+ "mean_token_accuracy": 0.9739799553155899,
+ "num_tokens": 8246971.0,
+ "step": 3350
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.2503813248872757,
+ "eval_loss": 0.9325668215751648,
+ "eval_mean_token_accuracy": 0.8365286010503769,
+ "eval_num_tokens": 8283933.0,
+ "eval_runtime": 49.0602,
+ "eval_samples_per_second": 32.593,
+ "eval_steps_per_second": 4.077,
+ "step": 3366
+ },
+ {
+ "entropy": 0.13715504267902084,
+ "epoch": 9.09103078982597,
+ "grad_norm": 0.32644304633140564,
+ "learning_rate": 5.260405747011887e-06,
+ "loss": 0.0776783800125122,
+ "mean_token_accuracy": 0.9762971684186146,
+ "num_tokens": 8361698.0,
+ "step": 3400
+ },
+ {
+ "entropy": 0.13289645686745644,
+ "epoch": 9.224899598393574,
+ "grad_norm": 0.3593284785747528,
+ "learning_rate": 3.839672246332384e-06,
+ "loss": 0.07479411125183105,
+ "mean_token_accuracy": 0.977249429821968,
+ "num_tokens": 8479562.0,
+ "step": 3450
+ },
+ {
+ "entropy": 0.1306164874136448,
+ "epoch": 9.358768406961179,
+ "grad_norm": 0.3294520378112793,
+ "learning_rate": 2.6386496791621076e-06,
+ "loss": 0.07242131233215332,
+ "mean_token_accuracy": 0.9776324343681335,
+ "num_tokens": 8601271.0,
+ "step": 3500
+ },
+ {
+ "entropy": 0.12282529093325138,
+ "epoch": 9.492637215528783,
+ "grad_norm": 0.28185349702835083,
+ "learning_rate": 1.6599531188889682e-06,
+ "loss": 0.06905817031860352,
+ "mean_token_accuracy": 0.9788999700546265,
+ "num_tokens": 8730858.0,
+ "step": 3550
+ },
+ {
+ "entropy": 0.12793996281921863,
+ "epoch": 9.626506024096386,
+ "grad_norm": 0.39180248975753784,
+ "learning_rate": 9.057135523899838e-07,
+ "loss": 0.0722837495803833,
+ "mean_token_accuracy": 0.9780756998062133,
+ "num_tokens": 8855189.0,
+ "step": 3600
+ },
+ {
+ "entropy": 0.1260003688186407,
+ "epoch": 9.76037483266399,
+ "grad_norm": 0.354835569858551,
+ "learning_rate": 3.775732400792635e-07,
+ "loss": 0.07001883029937744,
+ "mean_token_accuracy": 0.9790224677324295,
+ "num_tokens": 8979510.0,
+ "step": 3650
+ },
+ {
+ "entropy": 0.12340887859463692,
+ "epoch": 9.894243641231594,
+ "grad_norm": 0.3675302267074585,
+ "learning_rate": 7.668214009545532e-08,
+ "loss": 0.06868332386016845,
+ "mean_token_accuracy": 0.9791393029689789,
+ "num_tokens": 9111678.0,
+ "step": 3700
+ },
+ {
+ "epoch": 10.0,
+ "eval_entropy": 0.24003039725124836,
+ "eval_loss": 0.9899837374687195,
+ "eval_mean_token_accuracy": 0.8358298748731613,
+ "eval_num_tokens": 9204370.0,
+ "eval_runtime": 49.1056,
+ "eval_samples_per_second": 32.562,
+ "eval_steps_per_second": 4.073,
+ "step": 3740
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.829323098596997e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..62108c4c8ad6a925598383a7b5a5345aa9b946e6
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.0652985372477836,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tool-enabled system prompt: optional leading system message, then the tool signatures. #}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- Scan backwards for the last user turn that is not a wrapped tool response. #}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Split inline reasoning from the visible answer. #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- The think block is rendered only for assistant turns after the last user query. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a1747272bf5443b8411defd3f7cee01d9dd5990a
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test1/checkpoint-748/trainer_state.json
@@ -0,0 +1,196 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 748,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.8254910862445832,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 0.8958511352539062,
+ "learning_rate": 2.7446846603309888e-05,
+ "loss": 1.722928009033203,
+ "mean_token_accuracy": 0.6557231456041336,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.8272530055046081,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.6162250638008118,
+ "learning_rate": 5.5453832933217935e-05,
+ "loss": 0.7761137390136719,
+ "mean_token_accuracy": 0.795179500579834,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6629777508974075,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.43394413590431213,
+ "learning_rate": 8.346081926312598e-05,
+ "loss": 0.6302793884277343,
+ "mean_token_accuracy": 0.8262274640798569,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.6184763962030411,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.4373406767845154,
+ "learning_rate": 0.00011146780559303404,
+ "loss": 0.5863075256347656,
+ "mean_token_accuracy": 0.8357332807779312,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.6116772794723511,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.6781982183456421,
+ "learning_rate": 0.00013947479192294207,
+ "loss": 0.5783074951171875,
+ "mean_token_accuracy": 0.8368694090843201,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5848374783992767,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.3493351340293884,
+ "learning_rate": 0.00016748177825285014,
+ "loss": 0.5531037902832031,
+ "mean_token_accuracy": 0.8434528934955597,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.57043960750103,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.3953551650047302,
+ "learning_rate": 0.00019548876458275817,
+ "loss": 0.5387083053588867,
+ "mean_token_accuracy": 0.844996885061264,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.6065685012936592,
+ "eval_loss": 0.5825985074043274,
+ "eval_mean_token_accuracy": 0.8326057174801826,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.017,
+ "eval_samples_per_second": 32.621,
+ "eval_steps_per_second": 4.08,
+ "step": 374
+ },
+ {
+ "entropy": 0.5623676343397661,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.37431156635284424,
+ "learning_rate": 0.00020946374495076317,
+ "loss": 0.526231803894043,
+ "mean_token_accuracy": 0.8500469212580208,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5420066699385643,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.3302382826805115,
+ "learning_rate": 0.00020923573570386192,
+ "loss": 0.5088261032104492,
+ "mean_token_accuracy": 0.8518517130613327,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5347033357620239,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.281392902135849,
+ "learning_rate": 0.00020878021367110025,
+ "loss": 0.5044374084472656,
+ "mean_token_accuracy": 0.8548643559217453,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5319472518563271,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.3298404812812805,
+ "learning_rate": 0.00020809817069357935,
+ "loss": 0.5011252593994141,
+ "mean_token_accuracy": 0.8546603417396545,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5234624195098877,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.26496022939682007,
+ "learning_rate": 0.00020719109183285305,
+ "loss": 0.49288436889648435,
+ "mean_token_accuracy": 0.8559961414337158,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5170091751217842,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.2679271996021271,
+ "learning_rate": 0.00020606095213739626,
+ "loss": 0.4867116165161133,
+ "mean_token_accuracy": 0.8584025889635086,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.519580851495266,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.2783832848072052,
+ "learning_rate": 0.0002047102123421885,
+ "loss": 0.4858899688720703,
+ "mean_token_accuracy": 0.8601382756233216,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5516054129600525,
+ "eval_loss": 0.5468233227729797,
+ "eval_mean_token_accuracy": 0.8418151989579201,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.0401,
+ "eval_samples_per_second": 32.606,
+ "eval_steps_per_second": 4.078,
+ "step": 748
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 9.650338585790669e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..18d0a258ad2321bd2f63d9fd1f15f08b504f6542
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/README.md
@@ -0,0 +1,58 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: transformers
+model_name: Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2
+tags:
+- generated_from_trainer
+- trl
+- sft
+licence: license
+---
+
+# Model Card for Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2
+
+This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
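+generator = pipeline("text-generation", model="DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2", device="cuda")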
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/ed11jhv7)
+
+
+
+This model was trained with SFT.
+
+### Framework versions
+
+- TRL: 0.29.0
+- Transformers: 5.5.4
+- Pytorch: 2.10.0
+- Datasets: 4.6.1
+- Tokenizers: 0.22.2
+
+## Citations
+
+
+
+Cite TRL as:
+
+```bibtex
+@software{vonwerra2020trl,
+ title = {{TRL: Transformers Reinforcement Learning}},
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+ license = {Apache-2.0},
+ url = {https://github.com/huggingface/trl},
+ year = {2020}
+}
+```
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 256,
+ "lora_bias": false,
+ "lora_dropout": 0.0016857635936814886,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- With tools: the system block holds the optional system message, the tool signatures and the call format. -#}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}{#- No explicit reasoning_content: split it out of content at the closing think tag. -#}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is only re-emitted for assistant turns after the last real user query. -#}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. -#}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..58f07f6c680b15779a532dc3b2429ba36fd336b7
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1122/trainer_state.json
@@ -0,0 +1,287 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 1122,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.3424121737480164,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1671011447906494,
+ "learning_rate": 2.7185774847148058e-05,
+ "loss": 1.2469227600097657,
+ "mean_token_accuracy": 0.7243366748094558,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.6559184691309929,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.8399614691734314,
+ "learning_rate": 5.4926361425870564e-05,
+ "loss": 0.6245294189453126,
+ "mean_token_accuracy": 0.8268856984376908,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6082554489374161,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5691282749176025,
+ "learning_rate": 8.266694800459306e-05,
+ "loss": 0.5804204177856446,
+ "mean_token_accuracy": 0.8374843555688858,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.5805956655740738,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45651012659072876,
+ "learning_rate": 0.00011040753458331558,
+ "loss": 0.5495451354980468,
+ "mean_token_accuracy": 0.8450068402290344,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5810796636343002,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.493431955575943,
+ "learning_rate": 0.00013814812116203808,
+ "loss": 0.5546633911132812,
+ "mean_token_accuracy": 0.8431127589941024,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5683389616012573,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.4698493182659149,
+ "learning_rate": 0.00016588870774076058,
+ "loss": 0.5354468536376953,
+ "mean_token_accuracy": 0.8477097982168198,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5579970148205757,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.5637578964233398,
+ "learning_rate": 0.0001936292943194831,
+ "loss": 0.5262400054931641,
+ "mean_token_accuracy": 0.8487933957576752,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5953671643137932,
+ "eval_loss": 0.5836789608001709,
+ "eval_mean_token_accuracy": 0.8373425653576851,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.8765,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 374
+ },
+ {
+ "entropy": 0.5491663054986433,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.6051601767539978,
+ "learning_rate": 0.0002074713460228683,
+ "loss": 0.5132473373413086,
+ "mean_token_accuracy": 0.8526828770685677,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5334427100419998,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.4304046630859375,
+ "learning_rate": 0.00020724550557791978,
+ "loss": 0.497388916015625,
+ "mean_token_accuracy": 0.8541187030076981,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5297759872674942,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.4243815541267395,
+ "learning_rate": 0.00020679431642677408,
+ "loss": 0.49724563598632815,
+ "mean_token_accuracy": 0.8557562667131424,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5287212440371514,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.5970085859298706,
+ "learning_rate": 0.0002061187609762355,
+ "loss": 0.4903334808349609,
+ "mean_token_accuracy": 0.8566980129480362,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5196757692098618,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.353085458278656,
+ "learning_rate": 0.00020522031016209576,
+ "loss": 0.48056564331054685,
+ "mean_token_accuracy": 0.8591135066747665,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5124905353784561,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.383682519197464,
+ "learning_rate": 0.00020410092024635923,
+ "loss": 0.47599597930908205,
+ "mean_token_accuracy": 0.8606968414783478,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.5109778612852096,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.35959598422050476,
+ "learning_rate": 0.00020276302855773176,
+ "loss": 0.47350929260253904,
+ "mean_token_accuracy": 0.862987876534462,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5806624293327332,
+ "eval_loss": 0.5672553181648254,
+ "eval_mean_token_accuracy": 0.8352122050523758,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.8669,
+ "eval_samples_per_second": 32.045,
+ "eval_steps_per_second": 4.011,
+ "step": 748
+ },
+ {
+ "entropy": 0.5177121743409321,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.33286598324775696,
+ "learning_rate": 0.00020120954818464854,
+ "loss": 0.4759817886352539,
+ "mean_token_accuracy": 0.8611076672871908,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.43643771946430204,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.3718855679035187,
+ "learning_rate": 0.00019944386163239588,
+ "loss": 0.3936069107055664,
+ "mean_token_accuracy": 0.8780064672231674,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.44890178814530374,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.4402499496936798,
+ "learning_rate": 0.0001974698134581373,
+ "loss": 0.40427154541015625,
+ "mean_token_accuracy": 0.8763552361726761,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.44539201706647874,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.437489777803421,
+ "learning_rate": 0.00019529170189988115,
+ "loss": 0.4049137878417969,
+ "mean_token_accuracy": 0.8759620261192321,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4456777948141098,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.5221232175827026,
+ "learning_rate": 0.0001929142695176156,
+ "loss": 0.4049659729003906,
+ "mean_token_accuracy": 0.8767672145366668,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.44445386946201326,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.48457658290863037,
+ "learning_rate": 0.00019034269286698953,
+ "loss": 0.4065634536743164,
+ "mean_token_accuracy": 0.8764267575740814,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.44160154819488523,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.36902090907096863,
+ "learning_rate": 0.00018758257122802307,
+ "loss": 0.4023736953735352,
+ "mean_token_accuracy": 0.8786762475967407,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4473246121406555,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.3760707676410675,
+ "learning_rate": 0.00018463991441338993,
+ "loss": 0.40938362121582034,
+ "mean_token_accuracy": 0.8754651814699173,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4759794683754444,
+ "eval_loss": 0.5672900676727295,
+ "eval_mean_token_accuracy": 0.843816783130169,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.9422,
+ "eval_samples_per_second": 31.997,
+ "eval_steps_per_second": 4.005,
+ "step": 1122
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.529510818991954e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 256,
+ "lora_bias": false,
+ "lora_dropout": 0.0016857635936814886,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- With tools: the system block holds the optional system message, the tool signatures and the call format. -#}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}{#- No explicit reasoning_content: split it out of content at the closing think tag. -#}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is only re-emitted for assistant turns after the last real user query. -#}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. -#}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d080195d92ab55de2f2d8b4ab836604414caae55
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1496/trainer_state.json
@@ -0,0 +1,368 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 4.0,
+ "eval_steps": 500,
+ "global_step": 1496,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.3424121737480164,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1671011447906494,
+ "learning_rate": 2.7185774847148058e-05,
+ "loss": 1.2469227600097657,
+ "mean_token_accuracy": 0.7243366748094558,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.6559184691309929,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.8399614691734314,
+ "learning_rate": 5.4926361425870564e-05,
+ "loss": 0.6245294189453126,
+ "mean_token_accuracy": 0.8268856984376908,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6082554489374161,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5691282749176025,
+ "learning_rate": 8.266694800459306e-05,
+ "loss": 0.5804204177856446,
+ "mean_token_accuracy": 0.8374843555688858,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.5805956655740738,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45651012659072876,
+ "learning_rate": 0.00011040753458331558,
+ "loss": 0.5495451354980468,
+ "mean_token_accuracy": 0.8450068402290344,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5810796636343002,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.493431955575943,
+ "learning_rate": 0.00013814812116203808,
+ "loss": 0.5546633911132812,
+ "mean_token_accuracy": 0.8431127589941024,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5683389616012573,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.4698493182659149,
+ "learning_rate": 0.00016588870774076058,
+ "loss": 0.5354468536376953,
+ "mean_token_accuracy": 0.8477097982168198,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5579970148205757,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.5637578964233398,
+ "learning_rate": 0.0001936292943194831,
+ "loss": 0.5262400054931641,
+ "mean_token_accuracy": 0.8487933957576752,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5953671643137932,
+ "eval_loss": 0.5836789608001709,
+ "eval_mean_token_accuracy": 0.8373425653576851,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.8765,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 374
+ },
+ {
+ "entropy": 0.5491663054986433,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.6051601767539978,
+ "learning_rate": 0.0002074713460228683,
+ "loss": 0.5132473373413086,
+ "mean_token_accuracy": 0.8526828770685677,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5334427100419998,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.4304046630859375,
+ "learning_rate": 0.00020724550557791978,
+ "loss": 0.497388916015625,
+ "mean_token_accuracy": 0.8541187030076981,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5297759872674942,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.4243815541267395,
+ "learning_rate": 0.00020679431642677408,
+ "loss": 0.49724563598632815,
+ "mean_token_accuracy": 0.8557562667131424,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5287212440371514,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.5970085859298706,
+ "learning_rate": 0.0002061187609762355,
+ "loss": 0.4903334808349609,
+ "mean_token_accuracy": 0.8566980129480362,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5196757692098618,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.353085458278656,
+ "learning_rate": 0.00020522031016209576,
+ "loss": 0.48056564331054685,
+ "mean_token_accuracy": 0.8591135066747665,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5124905353784561,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.383682519197464,
+ "learning_rate": 0.00020410092024635923,
+ "loss": 0.47599597930908205,
+ "mean_token_accuracy": 0.8606968414783478,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.5109778612852096,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.35959598422050476,
+ "learning_rate": 0.00020276302855773176,
+ "loss": 0.47350929260253904,
+ "mean_token_accuracy": 0.862987876534462,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5806624293327332,
+ "eval_loss": 0.5672553181648254,
+ "eval_mean_token_accuracy": 0.8352122050523758,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.8669,
+ "eval_samples_per_second": 32.045,
+ "eval_steps_per_second": 4.011,
+ "step": 748
+ },
+ {
+ "entropy": 0.5177121743409321,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.33286598324775696,
+ "learning_rate": 0.00020120954818464854,
+ "loss": 0.4759817886352539,
+ "mean_token_accuracy": 0.8611076672871908,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.43643771946430204,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.3718855679035187,
+ "learning_rate": 0.00019944386163239588,
+ "loss": 0.3936069107055664,
+ "mean_token_accuracy": 0.8780064672231674,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.44890178814530374,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.4402499496936798,
+ "learning_rate": 0.0001974698134581373,
+ "loss": 0.40427154541015625,
+ "mean_token_accuracy": 0.8763552361726761,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.44539201706647874,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.437489777803421,
+ "learning_rate": 0.00019529170189988115,
+ "loss": 0.4049137878417969,
+ "mean_token_accuracy": 0.8759620261192321,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4456777948141098,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.5221232175827026,
+ "learning_rate": 0.0001929142695176156,
+ "loss": 0.4049659729003906,
+ "mean_token_accuracy": 0.8767672145366668,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.44445386946201326,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.48457658290863037,
+ "learning_rate": 0.00019034269286698953,
+ "loss": 0.4065634536743164,
+ "mean_token_accuracy": 0.8764267575740814,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.44160154819488523,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.36902090907096863,
+ "learning_rate": 0.00018758257122802307,
+ "loss": 0.4023736953735352,
+ "mean_token_accuracy": 0.8786762475967407,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4473246121406555,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.3760707676410675,
+ "learning_rate": 0.00018463991441338993,
+ "loss": 0.40938362121582034,
+ "mean_token_accuracy": 0.8754651814699173,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4759794683754444,
+ "eval_loss": 0.5672900676727295,
+ "eval_mean_token_accuracy": 0.843816783130169,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.9422,
+ "eval_samples_per_second": 31.997,
+ "eval_steps_per_second": 4.005,
+ "step": 1122
+ },
+ {
+ "entropy": 0.3893387094892637,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.4708668291568756,
+ "learning_rate": 0.00018152112968281706,
+ "loss": 0.3442184829711914,
+ "mean_token_accuracy": 0.8925870771359916,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.36019587606191633,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.49764320254325867,
+ "learning_rate": 0.00017823300779209423,
+ "loss": 0.3140977668762207,
+ "mean_token_accuracy": 0.8975517880916596,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.3615420612692833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.49308347702026367,
+ "learning_rate": 0.0001747827082070698,
+ "loss": 0.31728214263916016,
+ "mean_token_accuracy": 0.8988397961854935,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.36845425054430964,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.47647032141685486,
+ "learning_rate": 0.00017117774351482735,
+ "loss": 0.3203315734863281,
+ "mean_token_accuracy": 0.8968151319026947,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.37361170917749403,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.5282555818557739,
+ "learning_rate": 0.000167425963065986,
+ "loss": 0.32945499420166013,
+ "mean_token_accuracy": 0.8949814730882645,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.3739692223072052,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.41107845306396484,
+ "learning_rate": 0.00016353553588374095,
+ "loss": 0.32604251861572264,
+ "mean_token_accuracy": 0.896143769621849,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.37787127375602725,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.39750751852989197,
+ "learning_rate": 0.00015951493287685788,
+ "loss": 0.3352021026611328,
+ "mean_token_accuracy": 0.8935402005910873,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.4485042405128479,
+ "eval_loss": 0.5753094553947449,
+ "eval_mean_token_accuracy": 0.8450798985362052,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.9605,
+ "eval_samples_per_second": 31.985,
+ "eval_steps_per_second": 4.003,
+ "step": 1496
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.0412494723875738e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 256,
+ "lora_bias": false,
+ "lora_dropout": 0.0016857635936814886,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c3ab89aba0dc0e69061b554799e0589717afef72
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-1870/trainer_state.json
@@ -0,0 +1,459 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 5.0,
+ "eval_steps": 500,
+ "global_step": 1870,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.3424121737480164,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1671011447906494,
+ "learning_rate": 2.7185774847148058e-05,
+ "loss": 1.2469227600097657,
+ "mean_token_accuracy": 0.7243366748094558,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.6559184691309929,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.8399614691734314,
+ "learning_rate": 5.4926361425870564e-05,
+ "loss": 0.6245294189453126,
+ "mean_token_accuracy": 0.8268856984376908,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6082554489374161,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5691282749176025,
+ "learning_rate": 8.266694800459306e-05,
+ "loss": 0.5804204177856446,
+ "mean_token_accuracy": 0.8374843555688858,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.5805956655740738,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45651012659072876,
+ "learning_rate": 0.00011040753458331558,
+ "loss": 0.5495451354980468,
+ "mean_token_accuracy": 0.8450068402290344,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5810796636343002,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.493431955575943,
+ "learning_rate": 0.00013814812116203808,
+ "loss": 0.5546633911132812,
+ "mean_token_accuracy": 0.8431127589941024,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5683389616012573,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.4698493182659149,
+ "learning_rate": 0.00016588870774076058,
+ "loss": 0.5354468536376953,
+ "mean_token_accuracy": 0.8477097982168198,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5579970148205757,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.5637578964233398,
+ "learning_rate": 0.0001936292943194831,
+ "loss": 0.5262400054931641,
+ "mean_token_accuracy": 0.8487933957576752,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5953671643137932,
+ "eval_loss": 0.5836789608001709,
+ "eval_mean_token_accuracy": 0.8373425653576851,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.8765,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 374
+ },
+ {
+ "entropy": 0.5491663054986433,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.6051601767539978,
+ "learning_rate": 0.0002074713460228683,
+ "loss": 0.5132473373413086,
+ "mean_token_accuracy": 0.8526828770685677,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5334427100419998,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.4304046630859375,
+ "learning_rate": 0.00020724550557791978,
+ "loss": 0.497388916015625,
+ "mean_token_accuracy": 0.8541187030076981,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5297759872674942,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.4243815541267395,
+ "learning_rate": 0.00020679431642677408,
+ "loss": 0.49724563598632815,
+ "mean_token_accuracy": 0.8557562667131424,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5287212440371514,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.5970085859298706,
+ "learning_rate": 0.0002061187609762355,
+ "loss": 0.4903334808349609,
+ "mean_token_accuracy": 0.8566980129480362,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5196757692098618,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.353085458278656,
+ "learning_rate": 0.00020522031016209576,
+ "loss": 0.48056564331054685,
+ "mean_token_accuracy": 0.8591135066747665,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5124905353784561,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.383682519197464,
+ "learning_rate": 0.00020410092024635923,
+ "loss": 0.47599597930908205,
+ "mean_token_accuracy": 0.8606968414783478,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.5109778612852096,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.35959598422050476,
+ "learning_rate": 0.00020276302855773176,
+ "loss": 0.47350929260253904,
+ "mean_token_accuracy": 0.862987876534462,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5806624293327332,
+ "eval_loss": 0.5672553181648254,
+ "eval_mean_token_accuracy": 0.8352122050523758,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.8669,
+ "eval_samples_per_second": 32.045,
+ "eval_steps_per_second": 4.011,
+ "step": 748
+ },
+ {
+ "entropy": 0.5177121743409321,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.33286598324775696,
+ "learning_rate": 0.00020120954818464854,
+ "loss": 0.4759817886352539,
+ "mean_token_accuracy": 0.8611076672871908,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.43643771946430204,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.3718855679035187,
+ "learning_rate": 0.00019944386163239588,
+ "loss": 0.3936069107055664,
+ "mean_token_accuracy": 0.8780064672231674,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.44890178814530374,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.4402499496936798,
+ "learning_rate": 0.0001974698134581373,
+ "loss": 0.40427154541015625,
+ "mean_token_accuracy": 0.8763552361726761,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.44539201706647874,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.437489777803421,
+ "learning_rate": 0.00019529170189988115,
+ "loss": 0.4049137878417969,
+ "mean_token_accuracy": 0.8759620261192321,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4456777948141098,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.5221232175827026,
+ "learning_rate": 0.0001929142695176156,
+ "loss": 0.4049659729003906,
+ "mean_token_accuracy": 0.8767672145366668,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.44445386946201326,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.48457658290863037,
+ "learning_rate": 0.00019034269286698953,
+ "loss": 0.4065634536743164,
+ "mean_token_accuracy": 0.8764267575740814,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.44160154819488523,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.36902090907096863,
+ "learning_rate": 0.00018758257122802307,
+ "loss": 0.4023736953735352,
+ "mean_token_accuracy": 0.8786762475967407,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4473246121406555,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.3760707676410675,
+ "learning_rate": 0.00018463991441338993,
+ "loss": 0.40938362121582034,
+ "mean_token_accuracy": 0.8754651814699173,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4759794683754444,
+ "eval_loss": 0.5672900676727295,
+ "eval_mean_token_accuracy": 0.843816783130169,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.9422,
+ "eval_samples_per_second": 31.997,
+ "eval_steps_per_second": 4.005,
+ "step": 1122
+ },
+ {
+ "entropy": 0.3893387094892637,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.4708668291568756,
+ "learning_rate": 0.00018152112968281706,
+ "loss": 0.3442184829711914,
+ "mean_token_accuracy": 0.8925870771359916,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.36019587606191633,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.49764320254325867,
+ "learning_rate": 0.00017823300779209423,
+ "loss": 0.3140977668762207,
+ "mean_token_accuracy": 0.8975517880916596,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.3615420612692833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.49308347702026367,
+ "learning_rate": 0.0001747827082070698,
+ "loss": 0.31728214263916016,
+ "mean_token_accuracy": 0.8988397961854935,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.36845425054430964,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.47647032141685486,
+ "learning_rate": 0.00017117774351482735,
+ "loss": 0.3203315734863281,
+ "mean_token_accuracy": 0.8968151319026947,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.37361170917749403,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.5282555818557739,
+ "learning_rate": 0.000167425963065986,
+ "loss": 0.32945499420166013,
+ "mean_token_accuracy": 0.8949814730882645,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.3739692223072052,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.41107845306396484,
+ "learning_rate": 0.00016353553588374095,
+ "loss": 0.32604251861572264,
+ "mean_token_accuracy": 0.896143769621849,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.37787127375602725,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.39750751852989197,
+ "learning_rate": 0.00015951493287685788,
+ "loss": 0.3352021026611328,
+ "mean_token_accuracy": 0.8935402005910873,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.4485042405128479,
+ "eval_loss": 0.5753094553947449,
+ "eval_mean_token_accuracy": 0.8450798985362052,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.9605,
+ "eval_samples_per_second": 31.985,
+ "eval_steps_per_second": 4.003,
+ "step": 1496
+ },
+ {
+ "entropy": 0.3735277107869736,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.41153082251548767,
+ "learning_rate": 0.00015537290839535005,
+ "loss": 0.327095947265625,
+ "mean_token_accuracy": 0.8959399853089843,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.2689096394181252,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.5363789200782776,
+ "learning_rate": 0.00015111848116899814,
+ "loss": 0.2247480583190918,
+ "mean_token_accuracy": 0.9249986118078232,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.27684757232666013,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.5589100122451782,
+ "learning_rate": 0.00014676091467021694,
+ "loss": 0.23430667877197264,
+ "mean_token_accuracy": 0.9212016260623932,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.27285940989851953,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.4415719211101532,
+ "learning_rate": 0.00014230969694402636,
+ "loss": 0.23151195526123047,
+ "mean_token_accuracy": 0.922565575838089,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.28027778953313826,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.544822096824646,
+ "learning_rate": 0.0001377745199490439,
+ "loss": 0.23426279067993164,
+ "mean_token_accuracy": 0.9214058065414429,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.2855076715350151,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.47745293378829956,
+ "learning_rate": 0.00013316525845448153,
+ "loss": 0.2384078598022461,
+ "mean_token_accuracy": 0.9208149307966232,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.28488670364022256,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.6087909936904907,
+ "learning_rate": 0.00012849194853909585,
+ "loss": 0.24047565460205078,
+ "mean_token_accuracy": 0.9198217475414276,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.2799084801971912,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.4444660544395447,
+ "learning_rate": 0.00012376476573890707,
+ "loss": 0.23463037490844726,
+ "mean_token_accuracy": 0.9206935846805573,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3690616069734097,
+ "eval_loss": 0.6533966064453125,
+ "eval_mean_token_accuracy": 0.8432427588105201,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.893,
+ "eval_samples_per_second": 32.029,
+ "eval_steps_per_second": 4.009,
+ "step": 1870
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.545178197920768e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 256,
+ "lora_bias": false,
+ "lora_dropout": 0.0016857635936814886,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc42c5a0a68bd40d57e512b9172007bee433d499
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2244/trainer_state.json
@@ -0,0 +1,540 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 6.0,
+ "eval_steps": 500,
+ "global_step": 2244,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.3424121737480164,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1671011447906494,
+ "learning_rate": 2.7185774847148058e-05,
+ "loss": 1.2469227600097657,
+ "mean_token_accuracy": 0.7243366748094558,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.6559184691309929,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.8399614691734314,
+ "learning_rate": 5.4926361425870564e-05,
+ "loss": 0.6245294189453126,
+ "mean_token_accuracy": 0.8268856984376908,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6082554489374161,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5691282749176025,
+ "learning_rate": 8.266694800459306e-05,
+ "loss": 0.5804204177856446,
+ "mean_token_accuracy": 0.8374843555688858,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.5805956655740738,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45651012659072876,
+ "learning_rate": 0.00011040753458331558,
+ "loss": 0.5495451354980468,
+ "mean_token_accuracy": 0.8450068402290344,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5810796636343002,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.493431955575943,
+ "learning_rate": 0.00013814812116203808,
+ "loss": 0.5546633911132812,
+ "mean_token_accuracy": 0.8431127589941024,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5683389616012573,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.4698493182659149,
+ "learning_rate": 0.00016588870774076058,
+ "loss": 0.5354468536376953,
+ "mean_token_accuracy": 0.8477097982168198,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5579970148205757,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.5637578964233398,
+ "learning_rate": 0.0001936292943194831,
+ "loss": 0.5262400054931641,
+ "mean_token_accuracy": 0.8487933957576752,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5953671643137932,
+ "eval_loss": 0.5836789608001709,
+ "eval_mean_token_accuracy": 0.8373425653576851,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.8765,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 374
+ },
+ {
+ "entropy": 0.5491663054986433,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.6051601767539978,
+ "learning_rate": 0.0002074713460228683,
+ "loss": 0.5132473373413086,
+ "mean_token_accuracy": 0.8526828770685677,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5334427100419998,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.4304046630859375,
+ "learning_rate": 0.00020724550557791978,
+ "loss": 0.497388916015625,
+ "mean_token_accuracy": 0.8541187030076981,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5297759872674942,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.4243815541267395,
+ "learning_rate": 0.00020679431642677408,
+ "loss": 0.49724563598632815,
+ "mean_token_accuracy": 0.8557562667131424,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5287212440371514,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.5970085859298706,
+ "learning_rate": 0.0002061187609762355,
+ "loss": 0.4903334808349609,
+ "mean_token_accuracy": 0.8566980129480362,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5196757692098618,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.353085458278656,
+ "learning_rate": 0.00020522031016209576,
+ "loss": 0.48056564331054685,
+ "mean_token_accuracy": 0.8591135066747665,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5124905353784561,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.383682519197464,
+ "learning_rate": 0.00020410092024635923,
+ "loss": 0.47599597930908205,
+ "mean_token_accuracy": 0.8606968414783478,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.5109778612852096,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.35959598422050476,
+ "learning_rate": 0.00020276302855773176,
+ "loss": 0.47350929260253904,
+ "mean_token_accuracy": 0.862987876534462,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5806624293327332,
+ "eval_loss": 0.5672553181648254,
+ "eval_mean_token_accuracy": 0.8352122050523758,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.8669,
+ "eval_samples_per_second": 32.045,
+ "eval_steps_per_second": 4.011,
+ "step": 748
+ },
+ {
+ "entropy": 0.5177121743409321,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.33286598324775696,
+ "learning_rate": 0.00020120954818464854,
+ "loss": 0.4759817886352539,
+ "mean_token_accuracy": 0.8611076672871908,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.43643771946430204,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.3718855679035187,
+ "learning_rate": 0.00019944386163239588,
+ "loss": 0.3936069107055664,
+ "mean_token_accuracy": 0.8780064672231674,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.44890178814530374,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.4402499496936798,
+ "learning_rate": 0.0001974698134581373,
+ "loss": 0.40427154541015625,
+ "mean_token_accuracy": 0.8763552361726761,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.44539201706647874,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.437489777803421,
+ "learning_rate": 0.00019529170189988115,
+ "loss": 0.4049137878417969,
+ "mean_token_accuracy": 0.8759620261192321,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4456777948141098,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.5221232175827026,
+ "learning_rate": 0.0001929142695176156,
+ "loss": 0.4049659729003906,
+ "mean_token_accuracy": 0.8767672145366668,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.44445386946201326,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.48457658290863037,
+ "learning_rate": 0.00019034269286698953,
+ "loss": 0.4065634536743164,
+ "mean_token_accuracy": 0.8764267575740814,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.44160154819488523,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.36902090907096863,
+ "learning_rate": 0.00018758257122802307,
+ "loss": 0.4023736953735352,
+ "mean_token_accuracy": 0.8786762475967407,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4473246121406555,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.3760707676410675,
+ "learning_rate": 0.00018463991441338993,
+ "loss": 0.40938362121582034,
+ "mean_token_accuracy": 0.8754651814699173,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4759794683754444,
+ "eval_loss": 0.5672900676727295,
+ "eval_mean_token_accuracy": 0.843816783130169,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.9422,
+ "eval_samples_per_second": 31.997,
+ "eval_steps_per_second": 4.005,
+ "step": 1122
+ },
+ {
+ "entropy": 0.3893387094892637,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.4708668291568756,
+ "learning_rate": 0.00018152112968281706,
+ "loss": 0.3442184829711914,
+ "mean_token_accuracy": 0.8925870771359916,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.36019587606191633,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.49764320254325867,
+ "learning_rate": 0.00017823300779209423,
+ "loss": 0.3140977668762207,
+ "mean_token_accuracy": 0.8975517880916596,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.3615420612692833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.49308347702026367,
+ "learning_rate": 0.0001747827082070698,
+ "loss": 0.31728214263916016,
+ "mean_token_accuracy": 0.8988397961854935,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.36845425054430964,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.47647032141685486,
+ "learning_rate": 0.00017117774351482735,
+ "loss": 0.3203315734863281,
+ "mean_token_accuracy": 0.8968151319026947,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.37361170917749403,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.5282555818557739,
+ "learning_rate": 0.000167425963065986,
+ "loss": 0.32945499420166013,
+ "mean_token_accuracy": 0.8949814730882645,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.3739692223072052,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.41107845306396484,
+ "learning_rate": 0.00016353553588374095,
+ "loss": 0.32604251861572264,
+ "mean_token_accuracy": 0.896143769621849,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.37787127375602725,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.39750751852989197,
+ "learning_rate": 0.00015951493287685788,
+ "loss": 0.3352021026611328,
+ "mean_token_accuracy": 0.8935402005910873,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.4485042405128479,
+ "eval_loss": 0.5753094553947449,
+ "eval_mean_token_accuracy": 0.8450798985362052,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.9605,
+ "eval_samples_per_second": 31.985,
+ "eval_steps_per_second": 4.003,
+ "step": 1496
+ },
+ {
+ "entropy": 0.3735277107869736,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.41153082251548767,
+ "learning_rate": 0.00015537290839535005,
+ "loss": 0.327095947265625,
+ "mean_token_accuracy": 0.8959399853089843,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.2689096394181252,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.5363789200782776,
+ "learning_rate": 0.00015111848116899814,
+ "loss": 0.2247480583190918,
+ "mean_token_accuracy": 0.9249986118078232,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.27684757232666013,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.5589100122451782,
+ "learning_rate": 0.00014676091467021694,
+ "loss": 0.23430667877197264,
+ "mean_token_accuracy": 0.9212016260623932,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.27285940989851953,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.4415719211101532,
+ "learning_rate": 0.00014230969694402636,
+ "loss": 0.23151195526123047,
+ "mean_token_accuracy": 0.922565575838089,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.28027778953313826,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.544822096824646,
+ "learning_rate": 0.0001377745199490439,
+ "loss": 0.23426279067993164,
+ "mean_token_accuracy": 0.9214058065414429,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.2855076715350151,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.47745293378829956,
+ "learning_rate": 0.00013316525845448153,
+ "loss": 0.2384078598022461,
+ "mean_token_accuracy": 0.9208149307966232,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.28488670364022256,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.6087909936904907,
+ "learning_rate": 0.00012849194853909585,
+ "loss": 0.24047565460205078,
+ "mean_token_accuracy": 0.9198217475414276,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.2799084801971912,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.4444660544395447,
+ "learning_rate": 0.00012376476573890707,
+ "loss": 0.23463037490844726,
+ "mean_token_accuracy": 0.9206935846805573,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3690616069734097,
+ "eval_loss": 0.6533966064453125,
+ "eval_mean_token_accuracy": 0.8432427588105201,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.893,
+ "eval_samples_per_second": 32.029,
+ "eval_steps_per_second": 4.009,
+ "step": 1870
+ },
+ {
+ "entropy": 0.2288022293436407,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.4002731442451477,
+ "learning_rate": 0.0001189940028912678,
+ "loss": 0.17887537002563478,
+ "mean_token_accuracy": 0.9409801201386885,
+ "num_tokens": 4679483.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.192287794649601,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.4603167474269867,
+ "learning_rate": 0.00011419004772352316,
+ "loss": 0.14474411010742189,
+ "mean_token_accuracy": 0.9518448287248611,
+ "num_tokens": 4800131.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.19072901770472528,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.6232134103775024,
+ "learning_rate": 0.00010936336023505987,
+ "loss": 0.14428988456726075,
+ "mean_token_accuracy": 0.9511868554353714,
+ "num_tokens": 4923382.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.19722454741597176,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.49368390440940857,
+ "learning_rate": 0.00010452444992199237,
+ "loss": 0.15026931762695311,
+ "mean_token_accuracy": 0.9493078935146332,
+ "num_tokens": 5042200.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.1921817621588707,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.6033351421356201,
+ "learning_rate": 9.9683852894076e-05,
+ "loss": 0.15000157356262206,
+ "mean_token_accuracy": 0.9497224026918412,
+ "num_tokens": 5168285.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.19455582827329634,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.4534570276737213,
+ "learning_rate": 9.485210893367247e-05,
+ "loss": 0.14963313102722167,
+ "mean_token_accuracy": 0.94916872382164,
+ "num_tokens": 5289880.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.18738240271806716,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.5815815329551697,
+ "learning_rate": 9.003973854671866e-05,
+ "loss": 0.14579124450683595,
+ "mean_token_accuracy": 0.9498835545778275,
+ "num_tokens": 5413325.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.29356860227882864,
+ "eval_loss": 0.7589722275733948,
+ "eval_mean_token_accuracy": 0.8418136316537858,
+ "eval_num_tokens": 5522622.0,
+ "eval_runtime": 49.8769,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 2244
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.0504018349848576e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT), supervised fine-tuning, for text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 256,
+ "lora_bias": false,
+ "lora_dropout": 0.0016857635936814886,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- ChatML prompt template: an optional tool preamble in the system turn, then one im_start/im_end turn per message. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Scan backwards for the last user message that is not purely a wrapped tool response; its index is kept in ns.last_query_index. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {#- No explicit reasoning field: split inline reasoning off the content at the closing think tag. -#}{%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {#- Reasoning is re-emitted only for assistant turns after the last user query; earlier turns render content alone. -#}{%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {#- Consecutive tool messages share one user turn: open it on the first, close it after the last. -#}{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {#- With thinking explicitly disabled, pre-fill an empty think block after the assistant header. -#}{%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..cb2a6299585cd25931e76a4a84d0252e0b22078d
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2618/trainer_state.json
@@ -0,0 +1,631 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.0,
+ "eval_steps": 500,
+ "global_step": 2618,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.3424121737480164,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1671011447906494,
+ "learning_rate": 2.7185774847148058e-05,
+ "loss": 1.2469227600097657,
+ "mean_token_accuracy": 0.7243366748094558,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.6559184691309929,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.8399614691734314,
+ "learning_rate": 5.4926361425870564e-05,
+ "loss": 0.6245294189453126,
+ "mean_token_accuracy": 0.8268856984376908,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6082554489374161,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5691282749176025,
+ "learning_rate": 8.266694800459306e-05,
+ "loss": 0.5804204177856446,
+ "mean_token_accuracy": 0.8374843555688858,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.5805956655740738,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45651012659072876,
+ "learning_rate": 0.00011040753458331558,
+ "loss": 0.5495451354980468,
+ "mean_token_accuracy": 0.8450068402290344,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5810796636343002,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.493431955575943,
+ "learning_rate": 0.00013814812116203808,
+ "loss": 0.5546633911132812,
+ "mean_token_accuracy": 0.8431127589941024,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5683389616012573,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.4698493182659149,
+ "learning_rate": 0.00016588870774076058,
+ "loss": 0.5354468536376953,
+ "mean_token_accuracy": 0.8477097982168198,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5579970148205757,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.5637578964233398,
+ "learning_rate": 0.0001936292943194831,
+ "loss": 0.5262400054931641,
+ "mean_token_accuracy": 0.8487933957576752,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5953671643137932,
+ "eval_loss": 0.5836789608001709,
+ "eval_mean_token_accuracy": 0.8373425653576851,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.8765,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 374
+ },
+ {
+ "entropy": 0.5491663054986433,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.6051601767539978,
+ "learning_rate": 0.0002074713460228683,
+ "loss": 0.5132473373413086,
+ "mean_token_accuracy": 0.8526828770685677,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5334427100419998,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.4304046630859375,
+ "learning_rate": 0.00020724550557791978,
+ "loss": 0.497388916015625,
+ "mean_token_accuracy": 0.8541187030076981,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5297759872674942,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.4243815541267395,
+ "learning_rate": 0.00020679431642677408,
+ "loss": 0.49724563598632815,
+ "mean_token_accuracy": 0.8557562667131424,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5287212440371514,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.5970085859298706,
+ "learning_rate": 0.0002061187609762355,
+ "loss": 0.4903334808349609,
+ "mean_token_accuracy": 0.8566980129480362,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5196757692098618,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.353085458278656,
+ "learning_rate": 0.00020522031016209576,
+ "loss": 0.48056564331054685,
+ "mean_token_accuracy": 0.8591135066747665,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5124905353784561,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.383682519197464,
+ "learning_rate": 0.00020410092024635923,
+ "loss": 0.47599597930908205,
+ "mean_token_accuracy": 0.8606968414783478,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.5109778612852096,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.35959598422050476,
+ "learning_rate": 0.00020276302855773176,
+ "loss": 0.47350929260253904,
+ "mean_token_accuracy": 0.862987876534462,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5806624293327332,
+ "eval_loss": 0.5672553181648254,
+ "eval_mean_token_accuracy": 0.8352122050523758,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.8669,
+ "eval_samples_per_second": 32.045,
+ "eval_steps_per_second": 4.011,
+ "step": 748
+ },
+ {
+ "entropy": 0.5177121743409321,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.33286598324775696,
+ "learning_rate": 0.00020120954818464854,
+ "loss": 0.4759817886352539,
+ "mean_token_accuracy": 0.8611076672871908,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.43643771946430204,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.3718855679035187,
+ "learning_rate": 0.00019944386163239588,
+ "loss": 0.3936069107055664,
+ "mean_token_accuracy": 0.8780064672231674,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.44890178814530374,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.4402499496936798,
+ "learning_rate": 0.0001974698134581373,
+ "loss": 0.40427154541015625,
+ "mean_token_accuracy": 0.8763552361726761,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.44539201706647874,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.437489777803421,
+ "learning_rate": 0.00019529170189988115,
+ "loss": 0.4049137878417969,
+ "mean_token_accuracy": 0.8759620261192321,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4456777948141098,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.5221232175827026,
+ "learning_rate": 0.0001929142695176156,
+ "loss": 0.4049659729003906,
+ "mean_token_accuracy": 0.8767672145366668,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.44445386946201326,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.48457658290863037,
+ "learning_rate": 0.00019034269286698953,
+ "loss": 0.4065634536743164,
+ "mean_token_accuracy": 0.8764267575740814,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.44160154819488523,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.36902090907096863,
+ "learning_rate": 0.00018758257122802307,
+ "loss": 0.4023736953735352,
+ "mean_token_accuracy": 0.8786762475967407,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4473246121406555,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.3760707676410675,
+ "learning_rate": 0.00018463991441338993,
+ "loss": 0.40938362121582034,
+ "mean_token_accuracy": 0.8754651814699173,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4759794683754444,
+ "eval_loss": 0.5672900676727295,
+ "eval_mean_token_accuracy": 0.843816783130169,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.9422,
+ "eval_samples_per_second": 31.997,
+ "eval_steps_per_second": 4.005,
+ "step": 1122
+ },
+ {
+ "entropy": 0.3893387094892637,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.4708668291568756,
+ "learning_rate": 0.00018152112968281706,
+ "loss": 0.3442184829711914,
+ "mean_token_accuracy": 0.8925870771359916,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.36019587606191633,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.49764320254325867,
+ "learning_rate": 0.00017823300779209423,
+ "loss": 0.3140977668762207,
+ "mean_token_accuracy": 0.8975517880916596,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.3615420612692833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.49308347702026367,
+ "learning_rate": 0.0001747827082070698,
+ "loss": 0.31728214263916016,
+ "mean_token_accuracy": 0.8988397961854935,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.36845425054430964,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.47647032141685486,
+ "learning_rate": 0.00017117774351482735,
+ "loss": 0.3203315734863281,
+ "mean_token_accuracy": 0.8968151319026947,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.37361170917749403,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.5282555818557739,
+ "learning_rate": 0.000167425963065986,
+ "loss": 0.32945499420166013,
+ "mean_token_accuracy": 0.8949814730882645,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.3739692223072052,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.41107845306396484,
+ "learning_rate": 0.00016353553588374095,
+ "loss": 0.32604251861572264,
+ "mean_token_accuracy": 0.896143769621849,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.37787127375602725,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.39750751852989197,
+ "learning_rate": 0.00015951493287685788,
+ "loss": 0.3352021026611328,
+ "mean_token_accuracy": 0.8935402005910873,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.4485042405128479,
+ "eval_loss": 0.5753094553947449,
+ "eval_mean_token_accuracy": 0.8450798985362052,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.9605,
+ "eval_samples_per_second": 31.985,
+ "eval_steps_per_second": 4.003,
+ "step": 1496
+ },
+ {
+ "entropy": 0.3735277107869736,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.41153082251548767,
+ "learning_rate": 0.00015537290839535005,
+ "loss": 0.327095947265625,
+ "mean_token_accuracy": 0.8959399853089843,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.2689096394181252,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.5363789200782776,
+ "learning_rate": 0.00015111848116899814,
+ "loss": 0.2247480583190918,
+ "mean_token_accuracy": 0.9249986118078232,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.27684757232666013,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.5589100122451782,
+ "learning_rate": 0.00014676091467021694,
+ "loss": 0.23430667877197264,
+ "mean_token_accuracy": 0.9212016260623932,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.27285940989851953,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.4415719211101532,
+ "learning_rate": 0.00014230969694402636,
+ "loss": 0.23151195526123047,
+ "mean_token_accuracy": 0.922565575838089,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.28027778953313826,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.544822096824646,
+ "learning_rate": 0.0001377745199490439,
+ "loss": 0.23426279067993164,
+ "mean_token_accuracy": 0.9214058065414429,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.2855076715350151,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.47745293378829956,
+ "learning_rate": 0.00013316525845448153,
+ "loss": 0.2384078598022461,
+ "mean_token_accuracy": 0.9208149307966232,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.28488670364022256,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.6087909936904907,
+ "learning_rate": 0.00012849194853909585,
+ "loss": 0.24047565460205078,
+ "mean_token_accuracy": 0.9198217475414276,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.2799084801971912,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.4444660544395447,
+ "learning_rate": 0.00012376476573890707,
+ "loss": 0.23463037490844726,
+ "mean_token_accuracy": 0.9206935846805573,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3690616069734097,
+ "eval_loss": 0.6533966064453125,
+ "eval_mean_token_accuracy": 0.8432427588105201,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.893,
+ "eval_samples_per_second": 32.029,
+ "eval_steps_per_second": 4.009,
+ "step": 1870
+ },
+ {
+ "entropy": 0.2288022293436407,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.4002731442451477,
+ "learning_rate": 0.0001189940028912678,
+ "loss": 0.17887537002563478,
+ "mean_token_accuracy": 0.9409801201386885,
+ "num_tokens": 4679483.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.192287794649601,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.4603167474269867,
+ "learning_rate": 0.00011419004772352316,
+ "loss": 0.14474411010742189,
+ "mean_token_accuracy": 0.9518448287248611,
+ "num_tokens": 4800131.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.19072901770472528,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.6232134103775024,
+ "learning_rate": 0.00010936336023505987,
+ "loss": 0.14428988456726075,
+ "mean_token_accuracy": 0.9511868554353714,
+ "num_tokens": 4923382.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.19722454741597176,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.49368390440940857,
+ "learning_rate": 0.00010452444992199237,
+ "loss": 0.15026931762695311,
+ "mean_token_accuracy": 0.9493078935146332,
+ "num_tokens": 5042200.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.1921817621588707,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.6033351421356201,
+ "learning_rate": 9.9683852894076e-05,
+ "loss": 0.15000157356262206,
+ "mean_token_accuracy": 0.9497224026918412,
+ "num_tokens": 5168285.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.19455582827329634,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.4534570276737213,
+ "learning_rate": 9.485210893367247e-05,
+ "loss": 0.14963313102722167,
+ "mean_token_accuracy": 0.94916872382164,
+ "num_tokens": 5289880.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.18738240271806716,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.5815815329551697,
+ "learning_rate": 9.003973854671866e-05,
+ "loss": 0.14579124450683595,
+ "mean_token_accuracy": 0.9498835545778275,
+ "num_tokens": 5413325.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.29356860227882864,
+ "eval_loss": 0.7589722275733948,
+ "eval_mean_token_accuracy": 0.8418136316537858,
+ "eval_num_tokens": 5522622.0,
+ "eval_runtime": 49.8769,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 2244
+ },
+ {
+ "entropy": 0.19056345296628546,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.29291921854019165,
+ "learning_rate": 8.525722005566732e-05,
+ "loss": 0.14140020370483397,
+ "mean_token_accuracy": 0.9524310213146787,
+ "num_tokens": 5536511.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.13221844218671322,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.34443414211273193,
+ "learning_rate": 8.051496678427703e-05,
+ "loss": 0.0891877555847168,
+ "mean_token_accuracy": 0.97141546189785,
+ "num_tokens": 5663054.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.13209220491349696,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.4142569899559021,
+ "learning_rate": 7.58233043839285e-05,
+ "loss": 0.08825708389282226,
+ "mean_token_accuracy": 0.9711007869243622,
+ "num_tokens": 5790379.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.14063000075519086,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.47584882378578186,
+ "learning_rate": 7.119244835083612e-05,
+ "loss": 0.09473857879638672,
+ "mean_token_accuracy": 0.9696242707967758,
+ "num_tokens": 5908852.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.13315705463290214,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.3179854452610016,
+ "learning_rate": 6.66324817831086e-05,
+ "loss": 0.0911135196685791,
+ "mean_token_accuracy": 0.9703826290369034,
+ "num_tokens": 6033966.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.13554719373583793,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.49769842624664307,
+ "learning_rate": 6.215333342608944e-05,
+ "loss": 0.09153086662292481,
+ "mean_token_accuracy": 0.9705063331127167,
+ "num_tokens": 6156278.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.13915603026747703,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.467375785112381,
+ "learning_rate": 5.7764756053780784e-05,
+ "loss": 0.09427680969238281,
+ "mean_token_accuracy": 0.9695158433914185,
+ "num_tokens": 6276774.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.13692217327654363,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.36858609318733215,
+ "learning_rate": 5.3476305233422516e-05,
+ "loss": 0.09176054954528809,
+ "mean_token_accuracy": 0.969379341006279,
+ "num_tokens": 6401444.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.24440797246992588,
+ "eval_loss": 0.891926646232605,
+ "eval_mean_token_accuracy": 0.8400790172815323,
+ "eval_num_tokens": 6443059.0,
+ "eval_runtime": 49.9579,
+ "eval_samples_per_second": 31.987,
+ "eval_steps_per_second": 4.003,
+ "step": 2618
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.561128787588956e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 256,
+ "lora_bias": false,
+ "lora_dropout": 0.0016857635936814886,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..538149a07655e785730cc0a4f2d3cd515f54dc34
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-2992/trainer_state.json
@@ -0,0 +1,712 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.0,
+ "eval_steps": 500,
+ "global_step": 2992,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.3424121737480164,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1671011447906494,
+ "learning_rate": 2.7185774847148058e-05,
+ "loss": 1.2469227600097657,
+ "mean_token_accuracy": 0.7243366748094558,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.6559184691309929,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.8399614691734314,
+ "learning_rate": 5.4926361425870564e-05,
+ "loss": 0.6245294189453126,
+ "mean_token_accuracy": 0.8268856984376908,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6082554489374161,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5691282749176025,
+ "learning_rate": 8.266694800459306e-05,
+ "loss": 0.5804204177856446,
+ "mean_token_accuracy": 0.8374843555688858,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.5805956655740738,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45651012659072876,
+ "learning_rate": 0.00011040753458331558,
+ "loss": 0.5495451354980468,
+ "mean_token_accuracy": 0.8450068402290344,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5810796636343002,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.493431955575943,
+ "learning_rate": 0.00013814812116203808,
+ "loss": 0.5546633911132812,
+ "mean_token_accuracy": 0.8431127589941024,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5683389616012573,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.4698493182659149,
+ "learning_rate": 0.00016588870774076058,
+ "loss": 0.5354468536376953,
+ "mean_token_accuracy": 0.8477097982168198,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5579970148205757,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.5637578964233398,
+ "learning_rate": 0.0001936292943194831,
+ "loss": 0.5262400054931641,
+ "mean_token_accuracy": 0.8487933957576752,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5953671643137932,
+ "eval_loss": 0.5836789608001709,
+ "eval_mean_token_accuracy": 0.8373425653576851,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.8765,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 374
+ },
+ {
+ "entropy": 0.5491663054986433,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.6051601767539978,
+ "learning_rate": 0.0002074713460228683,
+ "loss": 0.5132473373413086,
+ "mean_token_accuracy": 0.8526828770685677,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5334427100419998,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.4304046630859375,
+ "learning_rate": 0.00020724550557791978,
+ "loss": 0.497388916015625,
+ "mean_token_accuracy": 0.8541187030076981,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5297759872674942,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.4243815541267395,
+ "learning_rate": 0.00020679431642677408,
+ "loss": 0.49724563598632815,
+ "mean_token_accuracy": 0.8557562667131424,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5287212440371514,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.5970085859298706,
+ "learning_rate": 0.0002061187609762355,
+ "loss": 0.4903334808349609,
+ "mean_token_accuracy": 0.8566980129480362,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5196757692098618,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.353085458278656,
+ "learning_rate": 0.00020522031016209576,
+ "loss": 0.48056564331054685,
+ "mean_token_accuracy": 0.8591135066747665,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5124905353784561,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.383682519197464,
+ "learning_rate": 0.00020410092024635923,
+ "loss": 0.47599597930908205,
+ "mean_token_accuracy": 0.8606968414783478,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.5109778612852096,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.35959598422050476,
+ "learning_rate": 0.00020276302855773176,
+ "loss": 0.47350929260253904,
+ "mean_token_accuracy": 0.862987876534462,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5806624293327332,
+ "eval_loss": 0.5672553181648254,
+ "eval_mean_token_accuracy": 0.8352122050523758,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.8669,
+ "eval_samples_per_second": 32.045,
+ "eval_steps_per_second": 4.011,
+ "step": 748
+ },
+ {
+ "entropy": 0.5177121743409321,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.33286598324775696,
+ "learning_rate": 0.00020120954818464854,
+ "loss": 0.4759817886352539,
+ "mean_token_accuracy": 0.8611076672871908,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.43643771946430204,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.3718855679035187,
+ "learning_rate": 0.00019944386163239588,
+ "loss": 0.3936069107055664,
+ "mean_token_accuracy": 0.8780064672231674,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.44890178814530374,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.4402499496936798,
+ "learning_rate": 0.0001974698134581373,
+ "loss": 0.40427154541015625,
+ "mean_token_accuracy": 0.8763552361726761,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.44539201706647874,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.437489777803421,
+ "learning_rate": 0.00019529170189988115,
+ "loss": 0.4049137878417969,
+ "mean_token_accuracy": 0.8759620261192321,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4456777948141098,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.5221232175827026,
+ "learning_rate": 0.0001929142695176156,
+ "loss": 0.4049659729003906,
+ "mean_token_accuracy": 0.8767672145366668,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.44445386946201326,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.48457658290863037,
+ "learning_rate": 0.00019034269286698953,
+ "loss": 0.4065634536743164,
+ "mean_token_accuracy": 0.8764267575740814,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.44160154819488523,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.36902090907096863,
+ "learning_rate": 0.00018758257122802307,
+ "loss": 0.4023736953735352,
+ "mean_token_accuracy": 0.8786762475967407,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4473246121406555,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.3760707676410675,
+ "learning_rate": 0.00018463991441338993,
+ "loss": 0.40938362121582034,
+ "mean_token_accuracy": 0.8754651814699173,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4759794683754444,
+ "eval_loss": 0.5672900676727295,
+ "eval_mean_token_accuracy": 0.843816783130169,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.9422,
+ "eval_samples_per_second": 31.997,
+ "eval_steps_per_second": 4.005,
+ "step": 1122
+ },
+ {
+ "entropy": 0.3893387094892637,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.4708668291568756,
+ "learning_rate": 0.00018152112968281706,
+ "loss": 0.3442184829711914,
+ "mean_token_accuracy": 0.8925870771359916,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.36019587606191633,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.49764320254325867,
+ "learning_rate": 0.00017823300779209423,
+ "loss": 0.3140977668762207,
+ "mean_token_accuracy": 0.8975517880916596,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.3615420612692833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.49308347702026367,
+ "learning_rate": 0.0001747827082070698,
+ "loss": 0.31728214263916016,
+ "mean_token_accuracy": 0.8988397961854935,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.36845425054430964,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.47647032141685486,
+ "learning_rate": 0.00017117774351482735,
+ "loss": 0.3203315734863281,
+ "mean_token_accuracy": 0.8968151319026947,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.37361170917749403,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.5282555818557739,
+ "learning_rate": 0.000167425963065986,
+ "loss": 0.32945499420166013,
+ "mean_token_accuracy": 0.8949814730882645,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.3739692223072052,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.41107845306396484,
+ "learning_rate": 0.00016353553588374095,
+ "loss": 0.32604251861572264,
+ "mean_token_accuracy": 0.896143769621849,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.37787127375602725,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.39750751852989197,
+ "learning_rate": 0.00015951493287685788,
+ "loss": 0.3352021026611328,
+ "mean_token_accuracy": 0.8935402005910873,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.4485042405128479,
+ "eval_loss": 0.5753094553947449,
+ "eval_mean_token_accuracy": 0.8450798985362052,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.9605,
+ "eval_samples_per_second": 31.985,
+ "eval_steps_per_second": 4.003,
+ "step": 1496
+ },
+ {
+ "entropy": 0.3735277107869736,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.41153082251548767,
+ "learning_rate": 0.00015537290839535005,
+ "loss": 0.327095947265625,
+ "mean_token_accuracy": 0.8959399853089843,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.2689096394181252,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.5363789200782776,
+ "learning_rate": 0.00015111848116899814,
+ "loss": 0.2247480583190918,
+ "mean_token_accuracy": 0.9249986118078232,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.27684757232666013,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.5589100122451782,
+ "learning_rate": 0.00014676091467021694,
+ "loss": 0.23430667877197264,
+ "mean_token_accuracy": 0.9212016260623932,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.27285940989851953,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.4415719211101532,
+ "learning_rate": 0.00014230969694402636,
+ "loss": 0.23151195526123047,
+ "mean_token_accuracy": 0.922565575838089,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.28027778953313826,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.544822096824646,
+ "learning_rate": 0.0001377745199490439,
+ "loss": 0.23426279067993164,
+ "mean_token_accuracy": 0.9214058065414429,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.2855076715350151,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.47745293378829956,
+ "learning_rate": 0.00013316525845448153,
+ "loss": 0.2384078598022461,
+ "mean_token_accuracy": 0.9208149307966232,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.28488670364022256,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.6087909936904907,
+ "learning_rate": 0.00012849194853909585,
+ "loss": 0.24047565460205078,
+ "mean_token_accuracy": 0.9198217475414276,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.2799084801971912,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.4444660544395447,
+ "learning_rate": 0.00012376476573890707,
+ "loss": 0.23463037490844726,
+ "mean_token_accuracy": 0.9206935846805573,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3690616069734097,
+ "eval_loss": 0.6533966064453125,
+ "eval_mean_token_accuracy": 0.8432427588105201,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.893,
+ "eval_samples_per_second": 32.029,
+ "eval_steps_per_second": 4.009,
+ "step": 1870
+ },
+ {
+ "entropy": 0.2288022293436407,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.4002731442451477,
+ "learning_rate": 0.0001189940028912678,
+ "loss": 0.17887537002563478,
+ "mean_token_accuracy": 0.9409801201386885,
+ "num_tokens": 4679483.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.192287794649601,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.4603167474269867,
+ "learning_rate": 0.00011419004772352316,
+ "loss": 0.14474411010742189,
+ "mean_token_accuracy": 0.9518448287248611,
+ "num_tokens": 4800131.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.19072901770472528,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.6232134103775024,
+ "learning_rate": 0.00010936336023505987,
+ "loss": 0.14428988456726075,
+ "mean_token_accuracy": 0.9511868554353714,
+ "num_tokens": 4923382.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.19722454741597176,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.49368390440940857,
+ "learning_rate": 0.00010452444992199237,
+ "loss": 0.15026931762695311,
+ "mean_token_accuracy": 0.9493078935146332,
+ "num_tokens": 5042200.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.1921817621588707,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.6033351421356201,
+ "learning_rate": 9.9683852894076e-05,
+ "loss": 0.15000157356262206,
+ "mean_token_accuracy": 0.9497224026918412,
+ "num_tokens": 5168285.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.19455582827329634,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.4534570276737213,
+ "learning_rate": 9.485210893367247e-05,
+ "loss": 0.14963313102722167,
+ "mean_token_accuracy": 0.94916872382164,
+ "num_tokens": 5289880.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.18738240271806716,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.5815815329551697,
+ "learning_rate": 9.003973854671866e-05,
+ "loss": 0.14579124450683595,
+ "mean_token_accuracy": 0.9498835545778275,
+ "num_tokens": 5413325.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.29356860227882864,
+ "eval_loss": 0.7589722275733948,
+ "eval_mean_token_accuracy": 0.8418136316537858,
+ "eval_num_tokens": 5522622.0,
+ "eval_runtime": 49.8769,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 2244
+ },
+ {
+ "entropy": 0.19056345296628546,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.29291921854019165,
+ "learning_rate": 8.525722005566732e-05,
+ "loss": 0.14140020370483397,
+ "mean_token_accuracy": 0.9524310213146787,
+ "num_tokens": 5536511.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.13221844218671322,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.34443414211273193,
+ "learning_rate": 8.051496678427703e-05,
+ "loss": 0.0891877555847168,
+ "mean_token_accuracy": 0.97141546189785,
+ "num_tokens": 5663054.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.13209220491349696,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.4142569899559021,
+ "learning_rate": 7.58233043839285e-05,
+ "loss": 0.08825708389282226,
+ "mean_token_accuracy": 0.9711007869243622,
+ "num_tokens": 5790379.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.14063000075519086,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.47584882378578186,
+ "learning_rate": 7.119244835083612e-05,
+ "loss": 0.09473857879638672,
+ "mean_token_accuracy": 0.9696242707967758,
+ "num_tokens": 5908852.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.13315705463290214,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.3179854452610016,
+ "learning_rate": 6.66324817831086e-05,
+ "loss": 0.0911135196685791,
+ "mean_token_accuracy": 0.9703826290369034,
+ "num_tokens": 6033966.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.13554719373583793,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.49769842624664307,
+ "learning_rate": 6.215333342608944e-05,
+ "loss": 0.09153086662292481,
+ "mean_token_accuracy": 0.9705063331127167,
+ "num_tokens": 6156278.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.13915603026747703,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.467375785112381,
+ "learning_rate": 5.7764756053780784e-05,
+ "loss": 0.09427680969238281,
+ "mean_token_accuracy": 0.9695158433914185,
+ "num_tokens": 6276774.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.13692217327654363,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.36858609318733215,
+ "learning_rate": 5.3476305233422516e-05,
+ "loss": 0.09176054954528809,
+ "mean_token_accuracy": 0.969379341006279,
+ "num_tokens": 6401444.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.24440797246992588,
+ "eval_loss": 0.891926646232605,
+ "eval_mean_token_accuracy": 0.8400790172815323,
+ "eval_num_tokens": 6443059.0,
+ "eval_runtime": 49.9579,
+ "eval_samples_per_second": 31.987,
+ "eval_steps_per_second": 4.003,
+ "step": 2618
+ },
+ {
+ "entropy": 0.1212329932234504,
+ "epoch": 7.085676037483267,
+ "grad_norm": 0.2522813677787781,
+ "learning_rate": 4.929731851946405e-05,
+ "loss": 0.07568974018096924,
+ "mean_token_accuracy": 0.975432159924748,
+ "num_tokens": 6525142.0,
+ "step": 2650
+ },
+ {
+ "entropy": 0.10857273273169994,
+ "epoch": 7.21954484605087,
+ "grad_norm": 0.2335294634103775,
+ "learning_rate": 4.5236895122230764e-05,
+ "loss": 0.06618132591247558,
+ "mean_token_accuracy": 0.9785620093345642,
+ "num_tokens": 6651930.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.1136517857015133,
+ "epoch": 7.353413654618474,
+ "grad_norm": 0.3027023375034332,
+ "learning_rate": 4.130387609555471e-05,
+ "loss": 0.06987609386444092,
+ "mean_token_accuracy": 0.9772803634405136,
+ "num_tokens": 6768469.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.10837352603673935,
+ "epoch": 7.4872824631860775,
+ "grad_norm": 0.3191539943218231,
+ "learning_rate": 3.750682508650807e-05,
+ "loss": 0.06725080013275146,
+ "mean_token_accuracy": 0.9786273115873336,
+ "num_tokens": 6892532.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.10379995822906495,
+ "epoch": 7.621151271753681,
+ "grad_norm": 0.21721133589744568,
+ "learning_rate": 3.3854009689154384e-05,
+ "loss": 0.06510573387145996,
+ "mean_token_accuracy": 0.9790040755271912,
+ "num_tokens": 7023373.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.10860319800674915,
+ "epoch": 7.755020080321285,
+ "grad_norm": 0.35063880681991577,
+ "learning_rate": 3.0353383442917245e-05,
+ "loss": 0.06781518936157227,
+ "mean_token_accuracy": 0.9782285010814666,
+ "num_tokens": 7146448.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.11041728757321835,
+ "epoch": 7.888888888888889,
+ "grad_norm": 0.27241161465644836,
+ "learning_rate": 2.7012568514763283e-05,
+ "loss": 0.06919246673583984,
+ "mean_token_accuracy": 0.9774098896980286,
+ "num_tokens": 7267680.0,
+ "step": 2950
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.216568651124835,
+ "eval_loss": 1.0052591562271118,
+ "eval_mean_token_accuracy": 0.841527444422245,
+ "eval_num_tokens": 7363496.0,
+ "eval_runtime": 49.8931,
+ "eval_samples_per_second": 32.028,
+ "eval_steps_per_second": 4.009,
+ "step": 2992
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.0686807864376934e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 256,
+ "lora_bias": false,
+ "lora_dropout": 0.0016857635936814886,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fdbeb61a6495ab85e0b69bb9efab68059de8baaf
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3366/trainer_state.json
@@ -0,0 +1,803 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.0,
+ "eval_steps": 500,
+ "global_step": 3366,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.3424121737480164,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1671011447906494,
+ "learning_rate": 2.7185774847148058e-05,
+ "loss": 1.2469227600097657,
+ "mean_token_accuracy": 0.7243366748094558,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.6559184691309929,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.8399614691734314,
+ "learning_rate": 5.4926361425870564e-05,
+ "loss": 0.6245294189453126,
+ "mean_token_accuracy": 0.8268856984376908,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6082554489374161,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5691282749176025,
+ "learning_rate": 8.266694800459306e-05,
+ "loss": 0.5804204177856446,
+ "mean_token_accuracy": 0.8374843555688858,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.5805956655740738,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45651012659072876,
+ "learning_rate": 0.00011040753458331558,
+ "loss": 0.5495451354980468,
+ "mean_token_accuracy": 0.8450068402290344,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5810796636343002,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.493431955575943,
+ "learning_rate": 0.00013814812116203808,
+ "loss": 0.5546633911132812,
+ "mean_token_accuracy": 0.8431127589941024,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5683389616012573,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.4698493182659149,
+ "learning_rate": 0.00016588870774076058,
+ "loss": 0.5354468536376953,
+ "mean_token_accuracy": 0.8477097982168198,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5579970148205757,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.5637578964233398,
+ "learning_rate": 0.0001936292943194831,
+ "loss": 0.5262400054931641,
+ "mean_token_accuracy": 0.8487933957576752,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5953671643137932,
+ "eval_loss": 0.5836789608001709,
+ "eval_mean_token_accuracy": 0.8373425653576851,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.8765,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 374
+ },
+ {
+ "entropy": 0.5491663054986433,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.6051601767539978,
+ "learning_rate": 0.0002074713460228683,
+ "loss": 0.5132473373413086,
+ "mean_token_accuracy": 0.8526828770685677,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5334427100419998,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.4304046630859375,
+ "learning_rate": 0.00020724550557791978,
+ "loss": 0.497388916015625,
+ "mean_token_accuracy": 0.8541187030076981,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5297759872674942,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.4243815541267395,
+ "learning_rate": 0.00020679431642677408,
+ "loss": 0.49724563598632815,
+ "mean_token_accuracy": 0.8557562667131424,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5287212440371514,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.5970085859298706,
+ "learning_rate": 0.0002061187609762355,
+ "loss": 0.4903334808349609,
+ "mean_token_accuracy": 0.8566980129480362,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5196757692098618,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.353085458278656,
+ "learning_rate": 0.00020522031016209576,
+ "loss": 0.48056564331054685,
+ "mean_token_accuracy": 0.8591135066747665,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5124905353784561,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.383682519197464,
+ "learning_rate": 0.00020410092024635923,
+ "loss": 0.47599597930908205,
+ "mean_token_accuracy": 0.8606968414783478,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.5109778612852096,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.35959598422050476,
+ "learning_rate": 0.00020276302855773176,
+ "loss": 0.47350929260253904,
+ "mean_token_accuracy": 0.862987876534462,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5806624293327332,
+ "eval_loss": 0.5672553181648254,
+ "eval_mean_token_accuracy": 0.8352122050523758,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.8669,
+ "eval_samples_per_second": 32.045,
+ "eval_steps_per_second": 4.011,
+ "step": 748
+ },
+ {
+ "entropy": 0.5177121743409321,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.33286598324775696,
+ "learning_rate": 0.00020120954818464854,
+ "loss": 0.4759817886352539,
+ "mean_token_accuracy": 0.8611076672871908,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.43643771946430204,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.3718855679035187,
+ "learning_rate": 0.00019944386163239588,
+ "loss": 0.3936069107055664,
+ "mean_token_accuracy": 0.8780064672231674,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.44890178814530374,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.4402499496936798,
+ "learning_rate": 0.0001974698134581373,
+ "loss": 0.40427154541015625,
+ "mean_token_accuracy": 0.8763552361726761,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.44539201706647874,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.437489777803421,
+ "learning_rate": 0.00019529170189988115,
+ "loss": 0.4049137878417969,
+ "mean_token_accuracy": 0.8759620261192321,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4456777948141098,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.5221232175827026,
+ "learning_rate": 0.0001929142695176156,
+ "loss": 0.4049659729003906,
+ "mean_token_accuracy": 0.8767672145366668,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.44445386946201326,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.48457658290863037,
+ "learning_rate": 0.00019034269286698953,
+ "loss": 0.4065634536743164,
+ "mean_token_accuracy": 0.8764267575740814,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.44160154819488523,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.36902090907096863,
+ "learning_rate": 0.00018758257122802307,
+ "loss": 0.4023736953735352,
+ "mean_token_accuracy": 0.8786762475967407,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4473246121406555,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.3760707676410675,
+ "learning_rate": 0.00018463991441338993,
+ "loss": 0.40938362121582034,
+ "mean_token_accuracy": 0.8754651814699173,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4759794683754444,
+ "eval_loss": 0.5672900676727295,
+ "eval_mean_token_accuracy": 0.843816783130169,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.9422,
+ "eval_samples_per_second": 31.997,
+ "eval_steps_per_second": 4.005,
+ "step": 1122
+ },
+ {
+ "entropy": 0.3893387094892637,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.4708668291568756,
+ "learning_rate": 0.00018152112968281706,
+ "loss": 0.3442184829711914,
+ "mean_token_accuracy": 0.8925870771359916,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.36019587606191633,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.49764320254325867,
+ "learning_rate": 0.00017823300779209423,
+ "loss": 0.3140977668762207,
+ "mean_token_accuracy": 0.8975517880916596,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.3615420612692833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.49308347702026367,
+ "learning_rate": 0.0001747827082070698,
+ "loss": 0.31728214263916016,
+ "mean_token_accuracy": 0.8988397961854935,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.36845425054430964,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.47647032141685486,
+ "learning_rate": 0.00017117774351482735,
+ "loss": 0.3203315734863281,
+ "mean_token_accuracy": 0.8968151319026947,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.37361170917749403,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.5282555818557739,
+ "learning_rate": 0.000167425963065986,
+ "loss": 0.32945499420166013,
+ "mean_token_accuracy": 0.8949814730882645,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.3739692223072052,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.41107845306396484,
+ "learning_rate": 0.00016353553588374095,
+ "loss": 0.32604251861572264,
+ "mean_token_accuracy": 0.896143769621849,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.37787127375602725,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.39750751852989197,
+ "learning_rate": 0.00015951493287685788,
+ "loss": 0.3352021026611328,
+ "mean_token_accuracy": 0.8935402005910873,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.4485042405128479,
+ "eval_loss": 0.5753094553947449,
+ "eval_mean_token_accuracy": 0.8450798985362052,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.9605,
+ "eval_samples_per_second": 31.985,
+ "eval_steps_per_second": 4.003,
+ "step": 1496
+ },
+ {
+ "entropy": 0.3735277107869736,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.41153082251548767,
+ "learning_rate": 0.00015537290839535005,
+ "loss": 0.327095947265625,
+ "mean_token_accuracy": 0.8959399853089843,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.2689096394181252,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.5363789200782776,
+ "learning_rate": 0.00015111848116899814,
+ "loss": 0.2247480583190918,
+ "mean_token_accuracy": 0.9249986118078232,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.27684757232666013,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.5589100122451782,
+ "learning_rate": 0.00014676091467021694,
+ "loss": 0.23430667877197264,
+ "mean_token_accuracy": 0.9212016260623932,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.27285940989851953,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.4415719211101532,
+ "learning_rate": 0.00014230969694402636,
+ "loss": 0.23151195526123047,
+ "mean_token_accuracy": 0.922565575838089,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.28027778953313826,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.544822096824646,
+ "learning_rate": 0.0001377745199490439,
+ "loss": 0.23426279067993164,
+ "mean_token_accuracy": 0.9214058065414429,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.2855076715350151,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.47745293378829956,
+ "learning_rate": 0.00013316525845448153,
+ "loss": 0.2384078598022461,
+ "mean_token_accuracy": 0.9208149307966232,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.28488670364022256,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.6087909936904907,
+ "learning_rate": 0.00012849194853909585,
+ "loss": 0.24047565460205078,
+ "mean_token_accuracy": 0.9198217475414276,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.2799084801971912,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.4444660544395447,
+ "learning_rate": 0.00012376476573890707,
+ "loss": 0.23463037490844726,
+ "mean_token_accuracy": 0.9206935846805573,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3690616069734097,
+ "eval_loss": 0.6533966064453125,
+ "eval_mean_token_accuracy": 0.8432427588105201,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.893,
+ "eval_samples_per_second": 32.029,
+ "eval_steps_per_second": 4.009,
+ "step": 1870
+ },
+ {
+ "entropy": 0.2288022293436407,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.4002731442451477,
+ "learning_rate": 0.0001189940028912678,
+ "loss": 0.17887537002563478,
+ "mean_token_accuracy": 0.9409801201386885,
+ "num_tokens": 4679483.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.192287794649601,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.4603167474269867,
+ "learning_rate": 0.00011419004772352316,
+ "loss": 0.14474411010742189,
+ "mean_token_accuracy": 0.9518448287248611,
+ "num_tokens": 4800131.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.19072901770472528,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.6232134103775024,
+ "learning_rate": 0.00010936336023505987,
+ "loss": 0.14428988456726075,
+ "mean_token_accuracy": 0.9511868554353714,
+ "num_tokens": 4923382.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.19722454741597176,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.49368390440940857,
+ "learning_rate": 0.00010452444992199237,
+ "loss": 0.15026931762695311,
+ "mean_token_accuracy": 0.9493078935146332,
+ "num_tokens": 5042200.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.1921817621588707,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.6033351421356201,
+ "learning_rate": 9.9683852894076e-05,
+ "loss": 0.15000157356262206,
+ "mean_token_accuracy": 0.9497224026918412,
+ "num_tokens": 5168285.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.19455582827329634,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.4534570276737213,
+ "learning_rate": 9.485210893367247e-05,
+ "loss": 0.14963313102722167,
+ "mean_token_accuracy": 0.94916872382164,
+ "num_tokens": 5289880.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.18738240271806716,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.5815815329551697,
+ "learning_rate": 9.003973854671866e-05,
+ "loss": 0.14579124450683595,
+ "mean_token_accuracy": 0.9498835545778275,
+ "num_tokens": 5413325.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.29356860227882864,
+ "eval_loss": 0.7589722275733948,
+ "eval_mean_token_accuracy": 0.8418136316537858,
+ "eval_num_tokens": 5522622.0,
+ "eval_runtime": 49.8769,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 2244
+ },
+ {
+ "entropy": 0.19056345296628546,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.29291921854019165,
+ "learning_rate": 8.525722005566732e-05,
+ "loss": 0.14140020370483397,
+ "mean_token_accuracy": 0.9524310213146787,
+ "num_tokens": 5536511.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.13221844218671322,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.34443414211273193,
+ "learning_rate": 8.051496678427703e-05,
+ "loss": 0.0891877555847168,
+ "mean_token_accuracy": 0.97141546189785,
+ "num_tokens": 5663054.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.13209220491349696,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.4142569899559021,
+ "learning_rate": 7.58233043839285e-05,
+ "loss": 0.08825708389282226,
+ "mean_token_accuracy": 0.9711007869243622,
+ "num_tokens": 5790379.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.14063000075519086,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.47584882378578186,
+ "learning_rate": 7.119244835083612e-05,
+ "loss": 0.09473857879638672,
+ "mean_token_accuracy": 0.9696242707967758,
+ "num_tokens": 5908852.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.13315705463290214,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.3179854452610016,
+ "learning_rate": 6.66324817831086e-05,
+ "loss": 0.0911135196685791,
+ "mean_token_accuracy": 0.9703826290369034,
+ "num_tokens": 6033966.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.13554719373583793,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.49769842624664307,
+ "learning_rate": 6.215333342608944e-05,
+ "loss": 0.09153086662292481,
+ "mean_token_accuracy": 0.9705063331127167,
+ "num_tokens": 6156278.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.13915603026747703,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.467375785112381,
+ "learning_rate": 5.7764756053780784e-05,
+ "loss": 0.09427680969238281,
+ "mean_token_accuracy": 0.9695158433914185,
+ "num_tokens": 6276774.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.13692217327654363,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.36858609318733215,
+ "learning_rate": 5.3476305233422516e-05,
+ "loss": 0.09176054954528809,
+ "mean_token_accuracy": 0.969379341006279,
+ "num_tokens": 6401444.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.24440797246992588,
+ "eval_loss": 0.891926646232605,
+ "eval_mean_token_accuracy": 0.8400790172815323,
+ "eval_num_tokens": 6443059.0,
+ "eval_runtime": 49.9579,
+ "eval_samples_per_second": 31.987,
+ "eval_steps_per_second": 4.003,
+ "step": 2618
+ },
+ {
+ "entropy": 0.1212329932234504,
+ "epoch": 7.085676037483267,
+ "grad_norm": 0.2522813677787781,
+ "learning_rate": 4.929731851946405e-05,
+ "loss": 0.07568974018096924,
+ "mean_token_accuracy": 0.975432159924748,
+ "num_tokens": 6525142.0,
+ "step": 2650
+ },
+ {
+ "entropy": 0.10857273273169994,
+ "epoch": 7.21954484605087,
+ "grad_norm": 0.2335294634103775,
+ "learning_rate": 4.5236895122230764e-05,
+ "loss": 0.06618132591247558,
+ "mean_token_accuracy": 0.9785620093345642,
+ "num_tokens": 6651930.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.1136517857015133,
+ "epoch": 7.353413654618474,
+ "grad_norm": 0.3027023375034332,
+ "learning_rate": 4.130387609555471e-05,
+ "loss": 0.06987609386444092,
+ "mean_token_accuracy": 0.9772803634405136,
+ "num_tokens": 6768469.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.10837352603673935,
+ "epoch": 7.4872824631860775,
+ "grad_norm": 0.3191539943218231,
+ "learning_rate": 3.750682508650807e-05,
+ "loss": 0.06725080013275146,
+ "mean_token_accuracy": 0.9786273115873336,
+ "num_tokens": 6892532.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.10379995822906495,
+ "epoch": 7.621151271753681,
+ "grad_norm": 0.21721133589744568,
+ "learning_rate": 3.3854009689154384e-05,
+ "loss": 0.06510573387145996,
+ "mean_token_accuracy": 0.9790040755271912,
+ "num_tokens": 7023373.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.10860319800674915,
+ "epoch": 7.755020080321285,
+ "grad_norm": 0.35063880681991577,
+ "learning_rate": 3.0353383442917245e-05,
+ "loss": 0.06781518936157227,
+ "mean_token_accuracy": 0.9782285010814666,
+ "num_tokens": 7146448.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.11041728757321835,
+ "epoch": 7.888888888888889,
+ "grad_norm": 0.27241161465644836,
+ "learning_rate": 2.7012568514763283e-05,
+ "loss": 0.06919246673583984,
+ "mean_token_accuracy": 0.9774098896980286,
+ "num_tokens": 7267680.0,
+ "step": 2950
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.216568651124835,
+ "eval_loss": 1.0052591562271118,
+ "eval_mean_token_accuracy": 0.841527444422245,
+ "eval_num_tokens": 7363496.0,
+ "eval_runtime": 49.8931,
+ "eval_samples_per_second": 32.028,
+ "eval_steps_per_second": 4.009,
+ "step": 2992
+ },
+ {
+ "entropy": 0.11074423059971646,
+ "epoch": 8.021419009370817,
+ "grad_norm": 0.12072166800498962,
+ "learning_rate": 2.3838839102906225e-05,
+ "loss": 0.07017123222351074,
+ "mean_token_accuracy": 0.9776547671568514,
+ "num_tokens": 7384150.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.10425103880465031,
+ "epoch": 8.15528781793842,
+ "grad_norm": 0.20287242531776428,
+ "learning_rate": 2.0839105598168276e-05,
+ "loss": 0.06177260398864746,
+ "mean_token_accuracy": 0.9801955896615983,
+ "num_tokens": 7502454.0,
+ "step": 3050
+ },
+ {
+ "entropy": 0.10014630381017924,
+ "epoch": 8.289156626506024,
+ "grad_norm": 0.1157577857375145,
+ "learning_rate": 1.8019899537486024e-05,
+ "loss": 0.05763424873352051,
+ "mean_token_accuracy": 0.9802741694450379,
+ "num_tokens": 7628419.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.09626397963613272,
+ "epoch": 8.423025435073628,
+ "grad_norm": 0.12889772653579712,
+ "learning_rate": 1.5387359382322228e-05,
+ "loss": 0.05830557346343994,
+ "mean_token_accuracy": 0.9807974797487259,
+ "num_tokens": 7751912.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.09667510379105806,
+ "epoch": 8.556894243641231,
+ "grad_norm": 0.18801453709602356,
+ "learning_rate": 1.2947217152949136e-05,
+ "loss": 0.058124661445617676,
+ "mean_token_accuracy": 0.98047631919384,
+ "num_tokens": 7877804.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.09806526392698288,
+ "epoch": 8.690763052208835,
+ "grad_norm": 0.11081992089748383,
+ "learning_rate": 1.0704785947705815e-05,
+ "loss": 0.05876843929290772,
+ "mean_token_accuracy": 0.9807141083478927,
+ "num_tokens": 8003296.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.1030188063904643,
+ "epoch": 8.824631860776439,
+ "grad_norm": 0.11520951986312866,
+ "learning_rate": 8.664948374404545e-06,
+ "loss": 0.06109299659729004,
+ "mean_token_accuracy": 0.9795061159133911,
+ "num_tokens": 8123859.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.10020156983286142,
+ "epoch": 8.958500669344042,
+ "grad_norm": 0.12751302123069763,
+ "learning_rate": 6.832145919075181e-06,
+ "loss": 0.05992648124694824,
+ "mean_token_accuracy": 0.9798818999528884,
+ "num_tokens": 8246971.0,
+ "step": 3350
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.20119483806192875,
+ "eval_loss": 1.0950454473495483,
+ "eval_mean_token_accuracy": 0.8413037645816803,
+ "eval_num_tokens": 8283933.0,
+ "eval_runtime": 49.8882,
+ "eval_samples_per_second": 32.032,
+ "eval_steps_per_second": 4.009,
+ "step": 3366
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.5767681026955366e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 256,
+ "lora_bias": false,
+ "lora_dropout": 0.0016857635936814886,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..769cd507cf763a85050736aef0cfe1e7f2e6e158
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-374/trainer_state.json
@@ -0,0 +1,115 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 500,
+ "global_step": 374,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.3424121737480164,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1671011447906494,
+ "learning_rate": 2.7185774847148058e-05,
+ "loss": 1.2469227600097657,
+ "mean_token_accuracy": 0.7243366748094558,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.6559184691309929,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.8399614691734314,
+ "learning_rate": 5.4926361425870564e-05,
+ "loss": 0.6245294189453126,
+ "mean_token_accuracy": 0.8268856984376908,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6082554489374161,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5691282749176025,
+ "learning_rate": 8.266694800459306e-05,
+ "loss": 0.5804204177856446,
+ "mean_token_accuracy": 0.8374843555688858,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.5805956655740738,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45651012659072876,
+ "learning_rate": 0.00011040753458331558,
+ "loss": 0.5495451354980468,
+ "mean_token_accuracy": 0.8450068402290344,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5810796636343002,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.493431955575943,
+ "learning_rate": 0.00013814812116203808,
+ "loss": 0.5546633911132812,
+ "mean_token_accuracy": 0.8431127589941024,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5683389616012573,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.4698493182659149,
+ "learning_rate": 0.00016588870774076058,
+ "loss": 0.5354468536376953,
+ "mean_token_accuracy": 0.8477097982168198,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5579970148205757,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.5637578964233398,
+ "learning_rate": 0.0001936292943194831,
+ "loss": 0.5262400054931641,
+ "mean_token_accuracy": 0.8487933957576752,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5953671643137932,
+ "eval_loss": 0.5836789608001709,
+ "eval_mean_token_accuracy": 0.8373425653576851,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.8765,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 374
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.057598899339366e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 256,
+ "lora_bias": false,
+ "lora_dropout": 0.0016857635936814886,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Qwen3 ChatML chat template: optional system/tools preamble, then every message, then an optional generation prompt. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Scan backwards for the last user message that is not a wrapped tool response; its index becomes ns.last_query_index. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {#- No separate reasoning_content field: split the reasoning out of the content at the closing think tag. -#}{%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {#- A think block is rendered only for assistant turns after the last user query: always for the final message, otherwise only when reasoning is non-empty. -#}{%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {#- Consecutive tool messages are grouped into a single user turn. -#}{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {#- Thinking explicitly disabled: pre-fill an empty think block. -#}{%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..aa79e9718e46078d448f64116375ce19a3293a3b
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-3740/trainer_state.json
@@ -0,0 +1,884 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 10.0,
+ "eval_steps": 500,
+ "global_step": 3740,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.3424121737480164,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1671011447906494,
+ "learning_rate": 2.7185774847148058e-05,
+ "loss": 1.2469227600097657,
+ "mean_token_accuracy": 0.7243366748094558,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.6559184691309929,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.8399614691734314,
+ "learning_rate": 5.4926361425870564e-05,
+ "loss": 0.6245294189453126,
+ "mean_token_accuracy": 0.8268856984376908,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6082554489374161,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5691282749176025,
+ "learning_rate": 8.266694800459306e-05,
+ "loss": 0.5804204177856446,
+ "mean_token_accuracy": 0.8374843555688858,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.5805956655740738,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45651012659072876,
+ "learning_rate": 0.00011040753458331558,
+ "loss": 0.5495451354980468,
+ "mean_token_accuracy": 0.8450068402290344,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5810796636343002,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.493431955575943,
+ "learning_rate": 0.00013814812116203808,
+ "loss": 0.5546633911132812,
+ "mean_token_accuracy": 0.8431127589941024,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5683389616012573,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.4698493182659149,
+ "learning_rate": 0.00016588870774076058,
+ "loss": 0.5354468536376953,
+ "mean_token_accuracy": 0.8477097982168198,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5579970148205757,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.5637578964233398,
+ "learning_rate": 0.0001936292943194831,
+ "loss": 0.5262400054931641,
+ "mean_token_accuracy": 0.8487933957576752,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5953671643137932,
+ "eval_loss": 0.5836789608001709,
+ "eval_mean_token_accuracy": 0.8373425653576851,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.8765,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 374
+ },
+ {
+ "entropy": 0.5491663054986433,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.6051601767539978,
+ "learning_rate": 0.0002074713460228683,
+ "loss": 0.5132473373413086,
+ "mean_token_accuracy": 0.8526828770685677,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5334427100419998,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.4304046630859375,
+ "learning_rate": 0.00020724550557791978,
+ "loss": 0.497388916015625,
+ "mean_token_accuracy": 0.8541187030076981,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5297759872674942,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.4243815541267395,
+ "learning_rate": 0.00020679431642677408,
+ "loss": 0.49724563598632815,
+ "mean_token_accuracy": 0.8557562667131424,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5287212440371514,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.5970085859298706,
+ "learning_rate": 0.0002061187609762355,
+ "loss": 0.4903334808349609,
+ "mean_token_accuracy": 0.8566980129480362,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5196757692098618,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.353085458278656,
+ "learning_rate": 0.00020522031016209576,
+ "loss": 0.48056564331054685,
+ "mean_token_accuracy": 0.8591135066747665,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5124905353784561,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.383682519197464,
+ "learning_rate": 0.00020410092024635923,
+ "loss": 0.47599597930908205,
+ "mean_token_accuracy": 0.8606968414783478,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.5109778612852096,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.35959598422050476,
+ "learning_rate": 0.00020276302855773176,
+ "loss": 0.47350929260253904,
+ "mean_token_accuracy": 0.862987876534462,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5806624293327332,
+ "eval_loss": 0.5672553181648254,
+ "eval_mean_token_accuracy": 0.8352122050523758,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.8669,
+ "eval_samples_per_second": 32.045,
+ "eval_steps_per_second": 4.011,
+ "step": 748
+ },
+ {
+ "entropy": 0.5177121743409321,
+ "epoch": 2.005354752342704,
+ "grad_norm": 0.33286598324775696,
+ "learning_rate": 0.00020120954818464854,
+ "loss": 0.4759817886352539,
+ "mean_token_accuracy": 0.8611076672871908,
+ "num_tokens": 1846410.0,
+ "step": 750
+ },
+ {
+ "entropy": 0.43643771946430204,
+ "epoch": 2.139223560910308,
+ "grad_norm": 0.3718855679035187,
+ "learning_rate": 0.00019944386163239588,
+ "loss": 0.3936069107055664,
+ "mean_token_accuracy": 0.8780064672231674,
+ "num_tokens": 1971547.0,
+ "step": 800
+ },
+ {
+ "entropy": 0.44890178814530374,
+ "epoch": 2.2730923694779115,
+ "grad_norm": 0.4402499496936798,
+ "learning_rate": 0.0001974698134581373,
+ "loss": 0.40427154541015625,
+ "mean_token_accuracy": 0.8763552361726761,
+ "num_tokens": 2088560.0,
+ "step": 850
+ },
+ {
+ "entropy": 0.44539201706647874,
+ "epoch": 2.4069611780455156,
+ "grad_norm": 0.437489777803421,
+ "learning_rate": 0.00019529170189988115,
+ "loss": 0.4049137878417969,
+ "mean_token_accuracy": 0.8759620261192321,
+ "num_tokens": 2210146.0,
+ "step": 900
+ },
+ {
+ "entropy": 0.4456777948141098,
+ "epoch": 2.540829986613119,
+ "grad_norm": 0.5221232175827026,
+ "learning_rate": 0.0001929142695176156,
+ "loss": 0.4049659729003906,
+ "mean_token_accuracy": 0.8767672145366668,
+ "num_tokens": 2331354.0,
+ "step": 950
+ },
+ {
+ "entropy": 0.44445386946201326,
+ "epoch": 2.674698795180723,
+ "grad_norm": 0.48457658290863037,
+ "learning_rate": 0.00019034269286698953,
+ "loss": 0.4065634536743164,
+ "mean_token_accuracy": 0.8764267575740814,
+ "num_tokens": 2452558.0,
+ "step": 1000
+ },
+ {
+ "entropy": 0.44160154819488523,
+ "epoch": 2.8085676037483265,
+ "grad_norm": 0.36902090907096863,
+ "learning_rate": 0.00018758257122802307,
+ "loss": 0.4023736953735352,
+ "mean_token_accuracy": 0.8786762475967407,
+ "num_tokens": 2582428.0,
+ "step": 1050
+ },
+ {
+ "entropy": 0.4473246121406555,
+ "epoch": 2.9424364123159306,
+ "grad_norm": 0.3760707676410675,
+ "learning_rate": 0.00018463991441338993,
+ "loss": 0.40938362121582034,
+ "mean_token_accuracy": 0.8754651814699173,
+ "num_tokens": 2707532.0,
+ "step": 1100
+ },
+ {
+ "epoch": 3.0,
+ "eval_entropy": 0.4759794683754444,
+ "eval_loss": 0.5672900676727295,
+ "eval_mean_token_accuracy": 0.843816783130169,
+ "eval_num_tokens": 2761311.0,
+ "eval_runtime": 49.9422,
+ "eval_samples_per_second": 31.997,
+ "eval_steps_per_second": 4.005,
+ "step": 1122
+ },
+ {
+ "entropy": 0.3893387094892637,
+ "epoch": 3.074966532797858,
+ "grad_norm": 0.4708668291568756,
+ "learning_rate": 0.00018152112968281706,
+ "loss": 0.3442184829711914,
+ "mean_token_accuracy": 0.8925870771359916,
+ "num_tokens": 2834344.0,
+ "step": 1150
+ },
+ {
+ "entropy": 0.36019587606191633,
+ "epoch": 3.208835341365462,
+ "grad_norm": 0.49764320254325867,
+ "learning_rate": 0.00017823300779209423,
+ "loss": 0.3140977668762207,
+ "mean_token_accuracy": 0.8975517880916596,
+ "num_tokens": 2952748.0,
+ "step": 1200
+ },
+ {
+ "entropy": 0.3615420612692833,
+ "epoch": 3.3427041499330654,
+ "grad_norm": 0.49308347702026367,
+ "learning_rate": 0.0001747827082070698,
+ "loss": 0.31728214263916016,
+ "mean_token_accuracy": 0.8988397961854935,
+ "num_tokens": 3080333.0,
+ "step": 1250
+ },
+ {
+ "entropy": 0.36845425054430964,
+ "epoch": 3.4765729585006695,
+ "grad_norm": 0.47647032141685486,
+ "learning_rate": 0.00017117774351482735,
+ "loss": 0.3203315734863281,
+ "mean_token_accuracy": 0.8968151319026947,
+ "num_tokens": 3206920.0,
+ "step": 1300
+ },
+ {
+ "entropy": 0.37361170917749403,
+ "epoch": 3.610441767068273,
+ "grad_norm": 0.5282555818557739,
+ "learning_rate": 0.000167425963065986,
+ "loss": 0.32945499420166013,
+ "mean_token_accuracy": 0.8949814730882645,
+ "num_tokens": 3327931.0,
+ "step": 1350
+ },
+ {
+ "entropy": 0.3739692223072052,
+ "epoch": 3.7443105756358768,
+ "grad_norm": 0.41107845306396484,
+ "learning_rate": 0.00016353553588374095,
+ "loss": 0.32604251861572264,
+ "mean_token_accuracy": 0.896143769621849,
+ "num_tokens": 3443874.0,
+ "step": 1400
+ },
+ {
+ "entropy": 0.37787127375602725,
+ "epoch": 3.878179384203481,
+ "grad_norm": 0.39750751852989197,
+ "learning_rate": 0.00015951493287685788,
+ "loss": 0.3352021026611328,
+ "mean_token_accuracy": 0.8935402005910873,
+ "num_tokens": 3573158.0,
+ "step": 1450
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.4485042405128479,
+ "eval_loss": 0.5753094553947449,
+ "eval_mean_token_accuracy": 0.8450798985362052,
+ "eval_num_tokens": 3681748.0,
+ "eval_runtime": 49.9605,
+ "eval_samples_per_second": 31.985,
+ "eval_steps_per_second": 4.003,
+ "step": 1496
+ },
+ {
+ "entropy": 0.3735277107869736,
+ "epoch": 4.010709504685408,
+ "grad_norm": 0.41153082251548767,
+ "learning_rate": 0.00015537290839535005,
+ "loss": 0.327095947265625,
+ "mean_token_accuracy": 0.8959399853089843,
+ "num_tokens": 3691654.0,
+ "step": 1500
+ },
+ {
+ "entropy": 0.2689096394181252,
+ "epoch": 4.144578313253012,
+ "grad_norm": 0.5363789200782776,
+ "learning_rate": 0.00015111848116899814,
+ "loss": 0.2247480583190918,
+ "mean_token_accuracy": 0.9249986118078232,
+ "num_tokens": 3813863.0,
+ "step": 1550
+ },
+ {
+ "entropy": 0.27684757232666013,
+ "epoch": 4.278447121820616,
+ "grad_norm": 0.5589100122451782,
+ "learning_rate": 0.00014676091467021694,
+ "loss": 0.23430667877197264,
+ "mean_token_accuracy": 0.9212016260623932,
+ "num_tokens": 3942009.0,
+ "step": 1600
+ },
+ {
+ "entropy": 0.27285940989851953,
+ "epoch": 4.412315930388219,
+ "grad_norm": 0.4415719211101532,
+ "learning_rate": 0.00014230969694402636,
+ "loss": 0.23151195526123047,
+ "mean_token_accuracy": 0.922565575838089,
+ "num_tokens": 4067146.0,
+ "step": 1650
+ },
+ {
+ "entropy": 0.28027778953313826,
+ "epoch": 4.546184738955823,
+ "grad_norm": 0.544822096824646,
+ "learning_rate": 0.0001377745199490439,
+ "loss": 0.23426279067993164,
+ "mean_token_accuracy": 0.9214058065414429,
+ "num_tokens": 4186586.0,
+ "step": 1700
+ },
+ {
+ "entropy": 0.2855076715350151,
+ "epoch": 4.680053547523427,
+ "grad_norm": 0.47745293378829956,
+ "learning_rate": 0.00013316525845448153,
+ "loss": 0.2384078598022461,
+ "mean_token_accuracy": 0.9208149307966232,
+ "num_tokens": 4307001.0,
+ "step": 1750
+ },
+ {
+ "entropy": 0.28488670364022256,
+ "epoch": 4.813922356091031,
+ "grad_norm": 0.6087909936904907,
+ "learning_rate": 0.00012849194853909585,
+ "loss": 0.24047565460205078,
+ "mean_token_accuracy": 0.9198217475414276,
+ "num_tokens": 4429513.0,
+ "step": 1800
+ },
+ {
+ "entropy": 0.2799084801971912,
+ "epoch": 4.947791164658635,
+ "grad_norm": 0.4444660544395447,
+ "learning_rate": 0.00012376476573890707,
+ "loss": 0.23463037490844726,
+ "mean_token_accuracy": 0.9206935846805573,
+ "num_tokens": 4557562.0,
+ "step": 1850
+ },
+ {
+ "epoch": 5.0,
+ "eval_entropy": 0.3690616069734097,
+ "eval_loss": 0.6533966064453125,
+ "eval_mean_token_accuracy": 0.8432427588105201,
+ "eval_num_tokens": 4602185.0,
+ "eval_runtime": 49.893,
+ "eval_samples_per_second": 32.029,
+ "eval_steps_per_second": 4.009,
+ "step": 1870
+ },
+ {
+ "entropy": 0.2288022293436407,
+ "epoch": 5.080321285140562,
+ "grad_norm": 0.4002731442451477,
+ "learning_rate": 0.0001189940028912678,
+ "loss": 0.17887537002563478,
+ "mean_token_accuracy": 0.9409801201386885,
+ "num_tokens": 4679483.0,
+ "step": 1900
+ },
+ {
+ "entropy": 0.192287794649601,
+ "epoch": 5.214190093708166,
+ "grad_norm": 0.4603167474269867,
+ "learning_rate": 0.00011419004772352316,
+ "loss": 0.14474411010742189,
+ "mean_token_accuracy": 0.9518448287248611,
+ "num_tokens": 4800131.0,
+ "step": 1950
+ },
+ {
+ "entropy": 0.19072901770472528,
+ "epoch": 5.34805890227577,
+ "grad_norm": 0.6232134103775024,
+ "learning_rate": 0.00010936336023505987,
+ "loss": 0.14428988456726075,
+ "mean_token_accuracy": 0.9511868554353714,
+ "num_tokens": 4923382.0,
+ "step": 2000
+ },
+ {
+ "entropy": 0.19722454741597176,
+ "epoch": 5.481927710843373,
+ "grad_norm": 0.49368390440940857,
+ "learning_rate": 0.00010452444992199237,
+ "loss": 0.15026931762695311,
+ "mean_token_accuracy": 0.9493078935146332,
+ "num_tokens": 5042200.0,
+ "step": 2050
+ },
+ {
+ "entropy": 0.1921817621588707,
+ "epoch": 5.615796519410977,
+ "grad_norm": 0.6033351421356201,
+ "learning_rate": 9.9683852894076e-05,
+ "loss": 0.15000157356262206,
+ "mean_token_accuracy": 0.9497224026918412,
+ "num_tokens": 5168285.0,
+ "step": 2100
+ },
+ {
+ "entropy": 0.19455582827329634,
+ "epoch": 5.749665327978581,
+ "grad_norm": 0.4534570276737213,
+ "learning_rate": 9.485210893367247e-05,
+ "loss": 0.14963313102722167,
+ "mean_token_accuracy": 0.94916872382164,
+ "num_tokens": 5289880.0,
+ "step": 2150
+ },
+ {
+ "entropy": 0.18738240271806716,
+ "epoch": 5.883534136546185,
+ "grad_norm": 0.5815815329551697,
+ "learning_rate": 9.003973854671866e-05,
+ "loss": 0.14579124450683595,
+ "mean_token_accuracy": 0.9498835545778275,
+ "num_tokens": 5413325.0,
+ "step": 2200
+ },
+ {
+ "epoch": 6.0,
+ "eval_entropy": 0.29356860227882864,
+ "eval_loss": 0.7589722275733948,
+ "eval_mean_token_accuracy": 0.8418136316537858,
+ "eval_num_tokens": 5522622.0,
+ "eval_runtime": 49.8769,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 2244
+ },
+ {
+ "entropy": 0.19056345296628546,
+ "epoch": 6.016064257028113,
+ "grad_norm": 0.29291921854019165,
+ "learning_rate": 8.525722005566732e-05,
+ "loss": 0.14140020370483397,
+ "mean_token_accuracy": 0.9524310213146787,
+ "num_tokens": 5536511.0,
+ "step": 2250
+ },
+ {
+ "entropy": 0.13221844218671322,
+ "epoch": 6.149933065595716,
+ "grad_norm": 0.34443414211273193,
+ "learning_rate": 8.051496678427703e-05,
+ "loss": 0.0891877555847168,
+ "mean_token_accuracy": 0.97141546189785,
+ "num_tokens": 5663054.0,
+ "step": 2300
+ },
+ {
+ "entropy": 0.13209220491349696,
+ "epoch": 6.28380187416332,
+ "grad_norm": 0.4142569899559021,
+ "learning_rate": 7.58233043839285e-05,
+ "loss": 0.08825708389282226,
+ "mean_token_accuracy": 0.9711007869243622,
+ "num_tokens": 5790379.0,
+ "step": 2350
+ },
+ {
+ "entropy": 0.14063000075519086,
+ "epoch": 6.417670682730924,
+ "grad_norm": 0.47584882378578186,
+ "learning_rate": 7.119244835083612e-05,
+ "loss": 0.09473857879638672,
+ "mean_token_accuracy": 0.9696242707967758,
+ "num_tokens": 5908852.0,
+ "step": 2400
+ },
+ {
+ "entropy": 0.13315705463290214,
+ "epoch": 6.551539491298527,
+ "grad_norm": 0.3179854452610016,
+ "learning_rate": 6.66324817831086e-05,
+ "loss": 0.0911135196685791,
+ "mean_token_accuracy": 0.9703826290369034,
+ "num_tokens": 6033966.0,
+ "step": 2450
+ },
+ {
+ "entropy": 0.13554719373583793,
+ "epoch": 6.685408299866131,
+ "grad_norm": 0.49769842624664307,
+ "learning_rate": 6.215333342608944e-05,
+ "loss": 0.09153086662292481,
+ "mean_token_accuracy": 0.9705063331127167,
+ "num_tokens": 6156278.0,
+ "step": 2500
+ },
+ {
+ "entropy": 0.13915603026747703,
+ "epoch": 6.8192771084337345,
+ "grad_norm": 0.467375785112381,
+ "learning_rate": 5.7764756053780784e-05,
+ "loss": 0.09427680969238281,
+ "mean_token_accuracy": 0.9695158433914185,
+ "num_tokens": 6276774.0,
+ "step": 2550
+ },
+ {
+ "entropy": 0.13692217327654363,
+ "epoch": 6.953145917001339,
+ "grad_norm": 0.36858609318733215,
+ "learning_rate": 5.3476305233422516e-05,
+ "loss": 0.09176054954528809,
+ "mean_token_accuracy": 0.969379341006279,
+ "num_tokens": 6401444.0,
+ "step": 2600
+ },
+ {
+ "epoch": 7.0,
+ "eval_entropy": 0.24440797246992588,
+ "eval_loss": 0.891926646232605,
+ "eval_mean_token_accuracy": 0.8400790172815323,
+ "eval_num_tokens": 6443059.0,
+ "eval_runtime": 49.9579,
+ "eval_samples_per_second": 31.987,
+ "eval_steps_per_second": 4.003,
+ "step": 2618
+ },
+ {
+ "entropy": 0.1212329932234504,
+ "epoch": 7.085676037483267,
+ "grad_norm": 0.2522813677787781,
+ "learning_rate": 4.929731851946405e-05,
+ "loss": 0.07568974018096924,
+ "mean_token_accuracy": 0.975432159924748,
+ "num_tokens": 6525142.0,
+ "step": 2650
+ },
+ {
+ "entropy": 0.10857273273169994,
+ "epoch": 7.21954484605087,
+ "grad_norm": 0.2335294634103775,
+ "learning_rate": 4.5236895122230764e-05,
+ "loss": 0.06618132591247558,
+ "mean_token_accuracy": 0.9785620093345642,
+ "num_tokens": 6651930.0,
+ "step": 2700
+ },
+ {
+ "entropy": 0.1136517857015133,
+ "epoch": 7.353413654618474,
+ "grad_norm": 0.3027023375034332,
+ "learning_rate": 4.130387609555471e-05,
+ "loss": 0.06987609386444092,
+ "mean_token_accuracy": 0.9772803634405136,
+ "num_tokens": 6768469.0,
+ "step": 2750
+ },
+ {
+ "entropy": 0.10837352603673935,
+ "epoch": 7.4872824631860775,
+ "grad_norm": 0.3191539943218231,
+ "learning_rate": 3.750682508650807e-05,
+ "loss": 0.06725080013275146,
+ "mean_token_accuracy": 0.9786273115873336,
+ "num_tokens": 6892532.0,
+ "step": 2800
+ },
+ {
+ "entropy": 0.10379995822906495,
+ "epoch": 7.621151271753681,
+ "grad_norm": 0.21721133589744568,
+ "learning_rate": 3.3854009689154384e-05,
+ "loss": 0.06510573387145996,
+ "mean_token_accuracy": 0.9790040755271912,
+ "num_tokens": 7023373.0,
+ "step": 2850
+ },
+ {
+ "entropy": 0.10860319800674915,
+ "epoch": 7.755020080321285,
+ "grad_norm": 0.35063880681991577,
+ "learning_rate": 3.0353383442917245e-05,
+ "loss": 0.06781518936157227,
+ "mean_token_accuracy": 0.9782285010814666,
+ "num_tokens": 7146448.0,
+ "step": 2900
+ },
+ {
+ "entropy": 0.11041728757321835,
+ "epoch": 7.888888888888889,
+ "grad_norm": 0.27241161465644836,
+ "learning_rate": 2.7012568514763283e-05,
+ "loss": 0.06919246673583984,
+ "mean_token_accuracy": 0.9774098896980286,
+ "num_tokens": 7267680.0,
+ "step": 2950
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.216568651124835,
+ "eval_loss": 1.0052591562271118,
+ "eval_mean_token_accuracy": 0.841527444422245,
+ "eval_num_tokens": 7363496.0,
+ "eval_runtime": 49.8931,
+ "eval_samples_per_second": 32.028,
+ "eval_steps_per_second": 4.009,
+ "step": 2992
+ },
+ {
+ "entropy": 0.11074423059971646,
+ "epoch": 8.021419009370817,
+ "grad_norm": 0.12072166800498962,
+ "learning_rate": 2.3838839102906225e-05,
+ "loss": 0.07017123222351074,
+ "mean_token_accuracy": 0.9776547671568514,
+ "num_tokens": 7384150.0,
+ "step": 3000
+ },
+ {
+ "entropy": 0.10425103880465031,
+ "epoch": 8.15528781793842,
+ "grad_norm": 0.20287242531776428,
+ "learning_rate": 2.0839105598168276e-05,
+ "loss": 0.06177260398864746,
+ "mean_token_accuracy": 0.9801955896615983,
+ "num_tokens": 7502454.0,
+ "step": 3050
+ },
+ {
+ "entropy": 0.10014630381017924,
+ "epoch": 8.289156626506024,
+ "grad_norm": 0.1157577857375145,
+ "learning_rate": 1.8019899537486024e-05,
+ "loss": 0.05763424873352051,
+ "mean_token_accuracy": 0.9802741694450379,
+ "num_tokens": 7628419.0,
+ "step": 3100
+ },
+ {
+ "entropy": 0.09626397963613272,
+ "epoch": 8.423025435073628,
+ "grad_norm": 0.12889772653579712,
+ "learning_rate": 1.5387359382322228e-05,
+ "loss": 0.05830557346343994,
+ "mean_token_accuracy": 0.9807974797487259,
+ "num_tokens": 7751912.0,
+ "step": 3150
+ },
+ {
+ "entropy": 0.09667510379105806,
+ "epoch": 8.556894243641231,
+ "grad_norm": 0.18801453709602356,
+ "learning_rate": 1.2947217152949136e-05,
+ "loss": 0.058124661445617676,
+ "mean_token_accuracy": 0.98047631919384,
+ "num_tokens": 7877804.0,
+ "step": 3200
+ },
+ {
+ "entropy": 0.09806526392698288,
+ "epoch": 8.690763052208835,
+ "grad_norm": 0.11081992089748383,
+ "learning_rate": 1.0704785947705815e-05,
+ "loss": 0.05876843929290772,
+ "mean_token_accuracy": 0.9807141083478927,
+ "num_tokens": 8003296.0,
+ "step": 3250
+ },
+ {
+ "entropy": 0.1030188063904643,
+ "epoch": 8.824631860776439,
+ "grad_norm": 0.11520951986312866,
+ "learning_rate": 8.664948374404545e-06,
+ "loss": 0.06109299659729004,
+ "mean_token_accuracy": 0.9795061159133911,
+ "num_tokens": 8123859.0,
+ "step": 3300
+ },
+ {
+ "entropy": 0.10020156983286142,
+ "epoch": 8.958500669344042,
+ "grad_norm": 0.12751302123069763,
+ "learning_rate": 6.832145919075181e-06,
+ "loss": 0.05992648124694824,
+ "mean_token_accuracy": 0.9798818999528884,
+ "num_tokens": 8246971.0,
+ "step": 3350
+ },
+ {
+ "epoch": 9.0,
+ "eval_entropy": 0.20119483806192875,
+ "eval_loss": 1.0950454473495483,
+ "eval_mean_token_accuracy": 0.8413037645816803,
+ "eval_num_tokens": 8283933.0,
+ "eval_runtime": 49.8882,
+ "eval_samples_per_second": 32.032,
+ "eval_steps_per_second": 4.009,
+ "step": 3366
+ },
+ {
+ "entropy": 0.10281606312050964,
+ "epoch": 9.09103078982597,
+ "grad_norm": 0.15680046379566193,
+ "learning_rate": 5.210369275196194e-06,
+ "loss": 0.06013503551483154,
+ "mean_token_accuracy": 0.9802021769562153,
+ "num_tokens": 8361698.0,
+ "step": 3400
+ },
+ {
+ "entropy": 0.1000148943066597,
+ "epoch": 9.224899598393574,
+ "grad_norm": 0.13952124118804932,
+ "learning_rate": 3.803149654468773e-06,
+ "loss": 0.05827256202697754,
+ "mean_token_accuracy": 0.9804249608516693,
+ "num_tokens": 8479562.0,
+ "step": 3450
+ },
+ {
+ "entropy": 0.0985061563923955,
+ "epoch": 9.358768406961179,
+ "grad_norm": 0.15697507560253143,
+ "learning_rate": 2.6135510980540095e-06,
+ "loss": 0.05692038536071777,
+ "mean_token_accuracy": 0.9809943473339081,
+ "num_tokens": 8601271.0,
+ "step": 3500
+ },
+ {
+ "entropy": 0.09138282071799039,
+ "epoch": 9.492637215528783,
+ "grad_norm": 0.09984570741653442,
+ "learning_rate": 1.6441638050141134e-06,
+ "loss": 0.0537615442276001,
+ "mean_token_accuracy": 0.9819633334875106,
+ "num_tokens": 8730858.0,
+ "step": 3550
+ },
+ {
+ "entropy": 0.09622499626129866,
+ "epoch": 9.626506024096386,
+ "grad_norm": 0.15339775383472443,
+ "learning_rate": 8.970984924845772e-07,
+ "loss": 0.05584990501403809,
+ "mean_token_accuracy": 0.9813891124725341,
+ "num_tokens": 8855189.0,
+ "step": 3600
+ },
+ {
+ "entropy": 0.09465554103255272,
+ "epoch": 9.76037483266399,
+ "grad_norm": 0.15608705580234528,
+ "learning_rate": 3.7398179985693506e-07,
+ "loss": 0.055440669059753415,
+ "mean_token_accuracy": 0.981783646941185,
+ "num_tokens": 8979510.0,
+ "step": 3650
+ },
+ {
+ "entropy": 0.09096938490867615,
+ "epoch": 9.894243641231594,
+ "grad_norm": 0.1507030725479126,
+ "learning_rate": 7.595274697899605e-08,
+ "loss": 0.05279422283172607,
+ "mean_token_accuracy": 0.9820951598882676,
+ "num_tokens": 9111678.0,
+ "step": 3700
+ },
+ {
+ "epoch": 10.0,
+ "eval_entropy": 0.19496785469353198,
+ "eval_loss": 1.1435716152191162,
+ "eval_mean_token_accuracy": 0.8412449145317078,
+ "eval_num_tokens": 9204370.0,
+ "eval_runtime": 49.8996,
+ "eval_samples_per_second": 32.024,
+ "eval_steps_per_second": 4.008,
+ "step": 3740
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.0880231228879974e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/README.md b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/adapter_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a67d7b089767a1f6708064da5a07f65728eada3e
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 256,
+ "lora_bias": false,
+ "lora_dropout": 0.0016857635936814886,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "k_proj",
+ "down_proj",
+ "o_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/chat_template.jinja b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Chat template: wraps each message in <|im_start|>role ... <|im_end|>; renders optional tool signatures inside <tools>, assistant tool calls inside <tool_call>, tool results inside <tool_response>, and assistant reasoning inside <think>. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/tokenizer_config.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/trainer_state.json b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a73aa8e422ab932ece21b234fc42868cf085a1d
--- /dev/null
+++ b/DBCA_original_Swedish/Qwen3-4B-Base_original_features_structural_train_original_features_structural_test2/checkpoint-748/trainer_state.json
@@ -0,0 +1,196 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 748,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 1.3424121737480164,
+ "epoch": 0.13386880856760375,
+ "grad_norm": 1.1671011447906494,
+ "learning_rate": 2.7185774847148058e-05,
+ "loss": 1.2469227600097657,
+ "mean_token_accuracy": 0.7243366748094558,
+ "num_tokens": 122643.0,
+ "step": 50
+ },
+ {
+ "entropy": 0.6559184691309929,
+ "epoch": 0.2677376171352075,
+ "grad_norm": 0.8399614691734314,
+ "learning_rate": 5.4926361425870564e-05,
+ "loss": 0.6245294189453126,
+ "mean_token_accuracy": 0.8268856984376908,
+ "num_tokens": 245702.0,
+ "step": 100
+ },
+ {
+ "entropy": 0.6082554489374161,
+ "epoch": 0.40160642570281124,
+ "grad_norm": 0.5691282749176025,
+ "learning_rate": 8.266694800459306e-05,
+ "loss": 0.5804204177856446,
+ "mean_token_accuracy": 0.8374843555688858,
+ "num_tokens": 371834.0,
+ "step": 150
+ },
+ {
+ "entropy": 0.5805956655740738,
+ "epoch": 0.535475234270415,
+ "grad_norm": 0.45651012659072876,
+ "learning_rate": 0.00011040753458331558,
+ "loss": 0.5495451354980468,
+ "mean_token_accuracy": 0.8450068402290344,
+ "num_tokens": 500948.0,
+ "step": 200
+ },
+ {
+ "entropy": 0.5810796636343002,
+ "epoch": 0.6693440428380187,
+ "grad_norm": 0.493431955575943,
+ "learning_rate": 0.00013814812116203808,
+ "loss": 0.5546633911132812,
+ "mean_token_accuracy": 0.8431127589941024,
+ "num_tokens": 621874.0,
+ "step": 250
+ },
+ {
+ "entropy": 0.5683389616012573,
+ "epoch": 0.8032128514056225,
+ "grad_norm": 0.4698493182659149,
+ "learning_rate": 0.00016588870774076058,
+ "loss": 0.5354468536376953,
+ "mean_token_accuracy": 0.8477097982168198,
+ "num_tokens": 746138.0,
+ "step": 300
+ },
+ {
+ "entropy": 0.5579970148205757,
+ "epoch": 0.9370816599732262,
+ "grad_norm": 0.5637578964233398,
+ "learning_rate": 0.0001936292943194831,
+ "loss": 0.5262400054931641,
+ "mean_token_accuracy": 0.8487933957576752,
+ "num_tokens": 868331.0,
+ "step": 350
+ },
+ {
+ "epoch": 1.0,
+ "eval_entropy": 0.5953671643137932,
+ "eval_loss": 0.5836789608001709,
+ "eval_mean_token_accuracy": 0.8373425653576851,
+ "eval_num_tokens": 920437.0,
+ "eval_runtime": 49.8765,
+ "eval_samples_per_second": 32.039,
+ "eval_steps_per_second": 4.01,
+ "step": 374
+ },
+ {
+ "entropy": 0.5491663054986433,
+ "epoch": 1.069611780455154,
+ "grad_norm": 0.6051601767539978,
+ "learning_rate": 0.0002074713460228683,
+ "loss": 0.5132473373413086,
+ "mean_token_accuracy": 0.8526828770685677,
+ "num_tokens": 980838.0,
+ "step": 400
+ },
+ {
+ "entropy": 0.5334427100419998,
+ "epoch": 1.2034805890227578,
+ "grad_norm": 0.4304046630859375,
+ "learning_rate": 0.00020724550557791978,
+ "loss": 0.497388916015625,
+ "mean_token_accuracy": 0.8541187030076981,
+ "num_tokens": 1104763.0,
+ "step": 450
+ },
+ {
+ "entropy": 0.5297759872674942,
+ "epoch": 1.3373493975903614,
+ "grad_norm": 0.4243815541267395,
+ "learning_rate": 0.00020679431642677408,
+ "loss": 0.49724563598632815,
+ "mean_token_accuracy": 0.8557562667131424,
+ "num_tokens": 1230459.0,
+ "step": 500
+ },
+ {
+ "entropy": 0.5287212440371514,
+ "epoch": 1.4712182061579653,
+ "grad_norm": 0.5970085859298706,
+ "learning_rate": 0.0002061187609762355,
+ "loss": 0.4903334808349609,
+ "mean_token_accuracy": 0.8566980129480362,
+ "num_tokens": 1356368.0,
+ "step": 550
+ },
+ {
+ "entropy": 0.5196757692098618,
+ "epoch": 1.605087014725569,
+ "grad_norm": 0.353085458278656,
+ "learning_rate": 0.00020522031016209576,
+ "loss": 0.48056564331054685,
+ "mean_token_accuracy": 0.8591135066747665,
+ "num_tokens": 1484569.0,
+ "step": 600
+ },
+ {
+ "entropy": 0.5124905353784561,
+ "epoch": 1.7389558232931726,
+ "grad_norm": 0.383682519197464,
+ "learning_rate": 0.00020410092024635923,
+ "loss": 0.47599597930908205,
+ "mean_token_accuracy": 0.8606968414783478,
+ "num_tokens": 1609962.0,
+ "step": 650
+ },
+ {
+ "entropy": 0.5109778612852096,
+ "epoch": 1.8728246318607764,
+ "grad_norm": 0.35959598422050476,
+ "learning_rate": 0.00020276302855773176,
+ "loss": 0.47350929260253904,
+ "mean_token_accuracy": 0.862987876534462,
+ "num_tokens": 1729667.0,
+ "step": 700
+ },
+ {
+ "epoch": 2.0,
+ "eval_entropy": 0.5806624293327332,
+ "eval_loss": 0.5672553181648254,
+ "eval_mean_token_accuracy": 0.8352122050523758,
+ "eval_num_tokens": 1840874.0,
+ "eval_runtime": 49.8669,
+ "eval_samples_per_second": 32.045,
+ "eval_steps_per_second": 4.011,
+ "step": 748
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3740,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.0167293607351706e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Chat template: wraps each message in <|im_start|>role ... <|im_end|>; renders optional tool signatures inside <tools>, assistant tool calls inside <tool_call>, tool results inside <tool_response>, and assistant reasoning inside <think>. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1875666670146c89e02e027ba11c310a297e6706
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3020/trainer_state.json
@@ -0,0 +1,3205 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.277442702050664,
+ "eval_steps": 20,
+ "global_step": 3020,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.5588208342934016e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tool-enabled conversation: one system turn carrying the optional system text plus the tool signatures. #}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user turn that is not a wrapped tool response. #}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Reasoning embedded in the content: split it from the visible answer. #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last user query are rendered with their think block. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}{#- Accept both the nested {"function": {...}} form and the flat form. #}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are merged into a single user turn. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. #}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4f687182abb2b000330496898ed667ad4e50e6ea
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3040/trainer_state.json
@@ -0,0 +1,3226 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.325693606755126,
+ "eval_steps": 20,
+ "global_step": 3040,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.5884529474138624e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1e13f384816b0831238b7adb2a32741d3037380c
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3060/trainer_state.json
@@ -0,0 +1,3247 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.37394451145959,
+ "eval_steps": 20,
+ "global_step": 3060,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.619818601629747e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/chat_template.jinja
@@ -0,0 +1,92 @@
+{#- Chat template: renders `messages` (plus optional `tools`) into <|im_start|>role ... <|im_end|> turns; also reads `add_generation_prompt` and `enable_thinking`. -#}
+{#- Every tag trims the whitespace before it, so indentation and these comments never reach the rendered prompt. -#}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Walk the conversation backwards to find the last user turn that is not a wrapped <tool_response> block. -#}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {#- No separate reasoning field: split an inline <think>...</think> section from the visible answer. -#}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- The think block is only re-emitted for assistant turns that come after the last user query. -#}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages share one user turn: open it on the first, close it after the last. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {#- When enable_thinking is explicitly false, pre-fill an empty <think></think> block. -#}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd60b0849e427c00dfc36e07c3f23630ca683023
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3080/trainer_state.json
@@ -0,0 +1,3268 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.422195416164053,
+ "eval_steps": 20,
+ "global_step": 3080,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.6497921431486976e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fae6dfa855f3a17fef64d96950ad7a68b07fea93
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3100/trainer_state.json
@@ -0,0 +1,3289 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.470446320868517,
+ "eval_steps": 20,
+ "global_step": 3100,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.677595678403021e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/chat_template.jinja
@@ -0,0 +1,92 @@
+{#- Qwen3 ChatML chat template. First section: system prompt, extended with the tool signatures when `tools` is supplied. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Walk the conversation backwards to find the last user message that is a real query, i.e. not a wrapped <tool_response> block. #}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{#- Render every message as an <|im_start|>role ... <|im_end|> turn. #}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Use message.reasoning_content when given; otherwise split an inline <think>...</think> block off the answer text. #}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Reasoning is re-emitted only for assistant turns that come after the last real user query. #}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages share one user turn; each result is wrapped in <tool_response> tags. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Open the assistant turn for generation; when enable_thinking is false, pre-fill an empty think block. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..18dbac322528fbf5ff116f0f99e4fde0653df402
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3120/trainer_state.json
@@ -0,0 +1,3310 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.518697225572979,
+ "eval_steps": 20,
+ "global_step": 3120,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.7098589021200896e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling / text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tool-aware system preamble: optional system text, then the tool schemas and the expected tool-call format. -#}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan messages in reverse for the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}{#- Separate inline reasoning from the visible answer; a non-empty delimiter is required for split(). -#}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is only re-emitted for assistant turns after the last user query. -#}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}{#- Pre-fill an empty think block when thinking is explicitly disabled. -#}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..444da11af0e9943c87bc71d5162369774b3ba03e
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3140/trainer_state.json
@@ -0,0 +1,3331 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.566948130277443,
+ "eval_steps": 20,
+ "global_step": 3140,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.742792663052749e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5936cad0de2afc112537f6b102cd2808c80804b3
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3160/trainer_state.json
@@ -0,0 +1,3352 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.615199034981906,
+ "eval_steps": 20,
+ "global_step": 3160,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.774101999182285e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- With tools: open a system turn, keep any leading system message, then list the tool schemas and the expected call format. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Defaults to the final message index when no qualifying user message is found below. -#}
+{%- for message in messages[::-1] %}{#- Newest-first scan: record the index of the last user message that is not a wrapped tool response. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- No explicit reasoning_content: split an inline think block out of the content. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last recorded user query may carry a think block. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}{#- Accept both the nested form (tool_call.function) and the flat name/arguments form. -#}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- A run of consecutive tool messages is emitted inside a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5bce9e9cb5ed34732dd937291baa9ff94108599
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3180/trainer_state.json
@@ -0,0 +1,3373 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.66344993968637,
+ "eval_steps": 20,
+ "global_step": 3180,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.8035704378642944e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc5c64d87da635eca649acbc38be83e98f05fb3f
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-320/trainer_state.json
@@ -0,0 +1,370 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.7720144752714113,
+ "eval_steps": 20,
+ "global_step": 320,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.911570708086784e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- System block: optional system message followed by the tool-calling instructions. -#}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}{#- Reasoning embedded in content: split it from the visible answer. -#}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last user query may carry a think block. -#}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}{#- A run of consecutive tool messages is rendered inside one user turn. -#}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. -#}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf4f54c3f38ce689e2ef9480b7a3174042ecd8a2
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3200/trainer_state.json
@@ -0,0 +1,3394 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.711700844390832,
+ "eval_steps": 20,
+ "global_step": 3200,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.831581646062029e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- ChatML-style prompt template: renders `messages` (plus optional `tools`) as <|im_start|>role ... <|im_end|> turns. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Scan backwards for the last user message that is not a wrapped tool response; its index ends up in ns.last_query_index. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {#- No separate reasoning_content field: split inline reasoning off the content at the closing think tag. -#}{%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- The think block is rendered only for assistant turns that come after the last user query. -#}{%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages are grouped into a single user turn. -#}{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {#- With enable_thinking explicitly false, pre-fill an empty think block in the generation prompt. -#}{%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d19773490f58e7da9635012ee482212f6baeb73f
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3220/trainer_state.json
@@ -0,0 +1,3415 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.759951749095295,
+ "eval_steps": 20,
+ "global_step": 3220,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.86267626948736e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/chat_template.jinja
@@ -0,0 +1,93 @@
+{#- Qwen3 chat template: renders `messages` (and optional `tools`) as <|im_start|>role ... <|im_end|> turns. #}
+{#- System turn: when tools are supplied, the tool signatures and the call format follow any leading system message. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Walk the messages backwards to find the last user message that is not a wrapped tool response. #}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{#- Render every turn in its original order. #}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {#- No explicit reasoning_content: split inline <think>...</think> reasoning from the visible reply. #}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- A think block is written only for turns after the last user query: always for the final turn, otherwise only when reasoning is present. #}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages share one user turn, each wrapped in its own tool_response tags. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Open the assistant turn; when enable_thinking is explicitly false, pre-fill an empty think block. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..706ca903c2185329afdcaaae04165376f19117d1
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3240/trainer_state.json
@@ -0,0 +1,3436 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.808202653799759,
+ "eval_steps": 20,
+ "global_step": 3240,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.894677262114867e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Qwen3-4B-Base LoRA adapter (overgeneralisation_code_Estonian, checkpoint-3260)
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT, rank 16) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Qwen3 chat template: emits an optional system/tools preamble, then every message wrapped in <|im_start|>role ... <|im_end|>; <think>, <tool_call> and <tool_response> are literal marker tags. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- walk backwards to find the last user turn that is not a wrapped tool response -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- inline reasoning: text before </think> is the reasoning, text after it is the reply -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..911c23180706772e28d978c164391e58ba3ddb19
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3260/trainer_state.json
@@ -0,0 +1,3457 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.856453558504222,
+ "eval_steps": 20,
+ "global_step": 3260,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.926100994357299e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tool-aware system prompt: optional system message followed by the JSON tool signatures. -#}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Walk the messages backwards to find the last user turn that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}{#- Render every turn between <|im_start|> and <|im_end|> markers. -#}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}{#- Split inline reasoning from the visible answer at the closing think tag. -#}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is only re-emitted for assistant turns after the last user query. -#}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}{#- Consecutive tool results are grouped into a single user turn. -#}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}{#- Open the assistant turn; an empty think block is pre-filled when enable_thinking is false. -#}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b96b1670bb8c47dcc688f21710f259d6fef34216
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3280/trainer_state.json
@@ -0,0 +1,3478 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.904704463208685,
+ "eval_steps": 20,
+ "global_step": 3280,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.9552209648800256e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Renders a message list as <|im_start|>role ... <|im_end|> turns; when `tools` is given, a system block listing the tool signatures is emitted first. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response; its index lands in ns.last_query_index. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}{#- No explicit reasoning field: split an inline <think>...</think> section out of the content. -#}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is only re-emitted for assistant turns after the last real user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are folded into a single user turn, each wrapped in <tool_response> tags. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking explicitly disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..21898c2c92025c319d9087f7528cd9bfe2dd1bc4
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3300/trainer_state.json
@@ -0,0 +1,3499 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 7.952955367913148,
+ "eval_steps": 20,
+ "global_step": 3300,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.985129388611635e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for text generation, trained with supervised fine-tuning (SFT)
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tools supplied: emit a system turn that lists every tool signature inside <tools>…</tools>. -#}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- Scan backwards for the last user turn that is not a wrapped tool response. -#}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}{#- Split inline reasoning (<think>…</think>) from the visible answer. -#}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last real user query keep their reasoning block. -#}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}{#- Consecutive tool results share one user turn, each wrapped in <tool_response>…</tool_response>. -#}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}{#- Open the assistant turn; with enable_thinking=false, pre-fill an empty think block. -#}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3946a9434b242725d4a6aa74ac671d305285f00
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3320/trainer_state.json
@@ -0,0 +1,3520 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.0,
+ "eval_steps": 20,
+ "global_step": 3320,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.0124324408720384e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling / text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- System preamble: optional system message followed by the JSON signature of every tool. #}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- Walk backwards to find the last user turn that is not a wrapped tool response. #}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Split inline reasoning from the visible answer; the separator must be non-empty. #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is only re-emitted for assistant turns after the last user query. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are merged into a single user turn. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..dd0b02e1edf669fdc6a109b632e60ec2df1a4897
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3340/trainer_state.json
@@ -0,0 +1,3541 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.048250904704464,
+ "eval_steps": 20,
+ "global_step": 3340,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.0405738846444544e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fc2925148ae483c103bc106ffb35da68276827f6
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3360/trainer_state.json
@@ -0,0 +1,3562 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.096501809408926,
+ "eval_steps": 20,
+ "global_step": 3360,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.0682313448623104e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..34385e35241ec7cae6cc44d36f3309df3c900c4d
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3380/trainer_state.json
@@ -0,0 +1,3583 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.14475271411339,
+ "eval_steps": 20,
+ "global_step": 3380,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.098549834660147e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d966192d99130a657b653bcb78a3f79b8e8161d3
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-340/trainer_state.json
@@ -0,0 +1,391 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.8202653799758746,
+ "eval_steps": 20,
+ "global_step": 340,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.22209455669248e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %} {#- With tools: emit a system turn that lists each tool's JSON signature. #}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %} {#- No tools: emit the leading system message, if there is one, unchanged. #}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} {#- Scan backwards for the last user message that is not a wrapped tool response. #}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %} {#- Otherwise split inline reasoning out of the content at the closing think tag. #}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %} {#- Reasoning is rendered only for assistant turns after the last real user query. #}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} {#- Consecutive tool messages share a single user turn. #}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %} {#- With enable_thinking false, the prompt is pre-filled with an empty think block. #}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..764e62f67588886399dc7b313c296d25e38cdfcb
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3400/trainer_state.json
@@ -0,0 +1,3604 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.193003618817853,
+ "eval_steps": 20,
+ "global_step": 3400,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.1325131606088704e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling / text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- System turn: optional system message followed by the JSON tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}{#- No separate reasoning field: split inline reasoning from the answer at the closing think tag. -#}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last user query keep their reasoning block. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool results are merged into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty reasoning block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..27e85f1b827de56d9b45d6d5c3bee4ce0c004a48
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3420/trainer_state.json
@@ -0,0 +1,3625 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.241254523522317,
+ "eval_steps": 20,
+ "global_step": 3420,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.1628334103469056e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd171ceb05290457aaefc0622ba9194ac2d36953
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3440/trainer_state.json
@@ -0,0 +1,3646 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.289505428226779,
+ "eval_steps": 20,
+ "global_step": 3440,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.189239553083699e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- ChatML-style prompt builder: system/tools preamble, then every message, then the optional generation prompt. #}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Scan the messages in reverse to record the index of the last user turn that is not a wrapped tool response. #}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {#- No separate reasoning field: split inline think-tagged reasoning from the visible answer. #}{%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Reasoning is re-emitted only for assistant turns that come after the last user query. #}{%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- A run of consecutive tool messages is wrapped in one user turn: opened on the first, closed on the last. #}{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {#- When enable_thinking is explicitly false, pre-fill an empty think block. #}{%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..bcc509f733688600cad3436810e33db0aadbe3a0
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3460/trainer_state.json
@@ -0,0 +1,3667 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.337756332931242,
+ "eval_steps": 20,
+ "global_step": 3460,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.2187555101510656e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..aebed85a25f7b5ce60770d403ed311a4f7f1af87
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3480/trainer_state.json
@@ -0,0 +1,3688 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.386007237635706,
+ "eval_steps": 20,
+ "global_step": 3480,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.250758262718771e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- ChatML prompt builder: optional tool-aware system header, then one <|im_start|>role ... <|im_end|> turn per message, then an optional assistant generation prompt. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Scan messages newest-first for the last genuine user query; user turns that are wrapped tool responses do not count. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {#- No separate reasoning field: split inline reasoning off the visible answer at the closing think tag. -#}{%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {#- Only assistant turns after the last user query may keep their reasoning block. -#}{%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {#- Consecutive tool messages are grouped into a single user turn. -#}{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {#- With thinking explicitly disabled, pre-fill an empty think block. -#}{%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..33bfa6a3eadf85968bcbd375da520b4e3478477d
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3500/trainer_state.json
@@ -0,0 +1,3709 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.43425814234017,
+ "eval_steps": 20,
+ "global_step": 3500,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.281882805127475e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4ecc3092b47c9bcc5c5b9f00c9c3d8a928237795
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3520/trainer_state.json
@@ -0,0 +1,3730 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.482509047044632,
+ "eval_steps": 20,
+ "global_step": 3520,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.311726111071744e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %} {#- Tools supplied: open a system turn that lists each tool schema as JSON inside a tools wrapper. #}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} {#- Next loop scans backwards for the last user turn that is not a wrapped tool response. #}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %} {#- No explicit reasoning_content: split it out of an inline think span, if one is present. #}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %} {#- Only assistant turns after the last real user query may carry a think block. #}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} {#- Consecutive tool results are grouped into one user turn. #}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %} {#- Thinking disabled: pre-fill an empty think block. #}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab52c6775bd9bff5aae74c43afefe1f05682e1d2
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3540/trainer_state.json
@@ -0,0 +1,3751 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.530759951749095,
+ "eval_steps": 20,
+ "global_step": 3540,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.3430284074404864e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tool mode: open a system turn and list every tool signature inside <tools>. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- Walk backwards to find the last user turn that is not a wrapped tool response. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}{#- No separate reasoning field: split an inline <think>...</think> section out of content. -#}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last user query may re-emit their reasoning block. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool results share one user turn, each wrapped in <tool_response>. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking explicitly disabled: pre-fill an empty reasoning block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b04706628dec08abeb10745f419aa6be3ff5cb57
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3560/trainer_state.json
@@ -0,0 +1,3772 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.579010856453559,
+ "eval_steps": 20,
+ "global_step": 3560,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.3726622805011456e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Chat template: renders `messages` (plus optional `tools`) into <|im_start|>/<|im_end|> delimited turns. #}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- Scan backwards for the last user turn that is not a wrapped tool response. #}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}{#- Split inline reasoning from the visible answer; the separator must be non-empty. #}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is only re-emitted for assistant turns after the last real user query. #}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. #}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}{#- With thinking disabled, pre-fill an empty reasoning block. #}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..20b0f94c9dc2c641cc7f11e769f02bffc1bdd675
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3580/trainer_state.json
@@ -0,0 +1,3793 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.627261761158021,
+ "eval_steps": 20,
+ "global_step": 3580,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.405333810344243e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..246ac18182157e7f1f83a5f33ec269dfcd845d63
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-360/trainer_state.json
@@ -0,0 +1,412 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.8685162846803377,
+ "eval_steps": 20,
+ "global_step": 360,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.512766279860224e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..caa6a6d9ad15698c072c595439b01254e42a37ff
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3600/trainer_state.json
@@ -0,0 +1,3814 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.675512665862485,
+ "eval_steps": 20,
+ "global_step": 3600,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.4358529333246976e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tools supplied: open a system turn that lists every tool signature as JSON. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- Scan backwards for the last user turn that is not a wrapped tool response. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Reasoning is embedded in the content: split it from the answer. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last real user query may carry a reasoning block. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool results are grouped into one user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty reasoning block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e767c9f7195566248c41fd93951a5336b7b16f18
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3620/trainer_state.json
@@ -0,0 +1,3835 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.723763570566948,
+ "eval_steps": 20,
+ "global_step": 3620,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.4669246775274496e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %} {#- With tools: open the system turn and list each tool's JSON signature inside <tools> tags. #}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} {#- Walk the messages backwards to find the last user turn that is not a wrapped tool response. #}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %} {#- No explicit reasoning_content: split the reasoning out of the content around the </think> marker. #}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %} {#- Only assistant turns after the last user query keep their <think> block. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %} {#- Consecutive tool messages are merged into a single user turn, each wrapped in <tool_response> tags. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %} {#- Thinking explicitly disabled: pre-fill an empty think block. #}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0932f4456d576c6b0206229802fcb19b93841b2b
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3640/trainer_state.json
@@ -0,0 +1,3856 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.772014475271412,
+ "eval_steps": 20,
+ "global_step": 3640,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.498913350573568e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Qwen3-4B-Base LoRA adapter (checkpoint-3660)
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tools supplied: open the system turn and list each tool signature as JSON -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Walk backwards to find the last user turn that is not just a wrapped tool response -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- No explicit reasoning_content: split inline reasoning out of the message text -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is re-emitted only for assistant turns after the last user query -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}{#- Consecutive tool results are grouped into a single user turn -#}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty reasoning block -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f624b42e08260bf994e70c06a6ca43df45405f3
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3660/trainer_state.json
@@ -0,0 +1,3877 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.820265379975874,
+ "eval_steps": 20,
+ "global_step": 3660,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.5266922466651136e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Qwen3-4B-Base LoRA adapter (overgeneralisation_code_Estonian, checkpoint-3680)
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tool-enabled prompt: the system block carries the optional system message followed by the tool signatures. -#}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user turn that is not a wrapped tool response; its index is kept in ns.last_query_index. -#}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}{#- Reasoning embedded in the content: split it from the visible answer. The separator must be non-empty, an empty one makes split() fail. -#}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last real user query get their reasoning re-emitted. -#}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}{#- Consecutive tool messages are merged into a single user turn, each wrapped in tool_response markers. -#}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block before the answer. -#}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8ee7507dac4ad8eeb47b67e89663c449c02e6524
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3680/trainer_state.json
@@ -0,0 +1,3898 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.868516284680338,
+ "eval_steps": 20,
+ "global_step": 3680,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.557906546023936e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling / text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Chat template: renders each message as <|im_start|>role ... <|im_end|>; when tools are passed, a system block listing their JSON signatures is emitted first. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Scan the messages in reverse to record the index of the last user message that is not a wrapped tool response. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {#- No separate reasoning_content: split an inline <think>...</think> section off the message content. -#}{%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {#- A run of consecutive tool messages is folded into one user turn, each result wrapped in <tool_response> tags. -#}{%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7e6ef7740c4778fd5ad573eefd3ad5052b3c517
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3700/trainer_state.json
@@ -0,0 +1,3919 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.916767189384801,
+ "eval_steps": 20,
+ "global_step": 3700,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.587241229250867e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- With tools: emit a system turn holding the optional system message plus each tool's JSON signature inside <tools> tags. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}{#- No separate reasoning field: split inline reasoning from the answer at the closing think tag. -#}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last real user query get their think block re-emitted. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are merged into one user turn, each wrapped in <tool_response> tags. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking explicitly disabled: prefill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..96b4d58444f3c4523955ceaf9f94c79393ee3797
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3720/trainer_state.json
@@ -0,0 +1,3940 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.965018094089265,
+ "eval_steps": 20,
+ "global_step": 3720,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.618293614111437e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..de846c80f2f0466d1512a59f1b376d26411815eb
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3740/trainer_state.json
@@ -0,0 +1,3961 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.012062726176115,
+ "eval_steps": 20,
+ "global_step": 3740,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.6462551039985664e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tools given: open a system turn, add the optional leading system message, then list each tool as JSON inside the tools tag. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}{#- No tools: emit the leading system message, if any, as its own turn. -#}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Walk the messages backwards to find the index of the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}{#- Render every turn in order. -#}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}{#- No explicit reasoning_content: split inline reasoning from the answer at the closing think tag. -#}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is re-emitted only for assistant turns after the last real user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8764653500cd0c471cc5af4b34dd2fb9ce4edd0
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3760/trainer_state.json
@@ -0,0 +1,3982 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.060313630880579,
+ "eval_steps": 20,
+ "global_step": 3760,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.6746693385017344e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %} {#- Tools supplied: emit one system turn holding the optional leading system message followed by the tool-calling instructions. #}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} {#- Scan the messages backwards for the last user message that is not a wrapped tool response. #}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %} {#- No separate reasoning_content field: split inline reasoning from the answer at the closing think tag. #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %} {#- Reasoning is rendered only for assistant turns after the last user query; earlier turns emit content alone. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %} {#- Consecutive tool messages are grouped into a single user turn, each wrapped in tool_response tags. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %} {#- Thinking explicitly disabled: pre-fill an empty think block in the assistant turn. #}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ad4a6d9c85e62edaf15c1822bb2b9288fa4b6916
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3780/trainer_state.json
@@ -0,0 +1,4003 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.108564535585042,
+ "eval_steps": 20,
+ "global_step": 3780,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.701782977198285e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tools supplied: open a system turn that lists every tool signature inside <tools>…</tools>. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- Scan newest-to-oldest for the last user message that is not a wrapped tool response. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Reasoning is inline: split it from the visible answer at the closing tag. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only turns after the last user query may re-emit their reasoning block. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are merged into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty reasoning block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6889fb4a6e581cf454b09fd4b4163c5a37b39aca
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-380/trainer_state.json
@@ -0,0 +1,433 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.916767189384801,
+ "eval_steps": 20,
+ "global_step": 380,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.782477115265024e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tools supplied: open a system turn holding the optional system message, the tool signatures, and the tool-call format. #}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Walk the messages backwards to find the last user turn that is not a wrapped tool response. #}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}{#- No explicit reasoning_content: split inline reasoning from the answer at the closing think tag. #}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is only re-emitted for assistant turns that come after the last user query. #}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}{#- Consecutive tool results are grouped into a single user turn. #}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. #}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c78793f469ccb48f644f68eeaca77830e9c16d1a
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3800/trainer_state.json
@@ -0,0 +1,4024 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.156815440289506,
+ "eval_steps": 20,
+ "global_step": 3800,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.734250353978368e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal-language-model text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- With tools: open the system turn and list each tool's JSON signature inside <tools> tags. #}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response. #}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}{#- No explicit reasoning_content: split the reasoning out of the content at the </think> marker. #}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last user query may re-render their <think> block. #}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}{#- Each tool call is emitted as a JSON object wrapped in <tool_call> tags. #}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}{#- Consecutive tool messages share one user turn, each wrapped in <tool_response> tags. #}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}{#- enable_thinking=false pre-fills an empty think block. #}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6b63403682192b7dd91cfba91d98862a3d8f45c4
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3820/trainer_state.json
@@ -0,0 +1,4045 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.205066344993968,
+ "eval_steps": 20,
+ "global_step": 3820,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.766591015064166e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Qwen3-4B-Base LoRA Adapter (checkpoint-3840)
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- ChatML-style chat template: optional system/tools preamble, then each message wrapped in <|im_start|>role ... <|im_end|>; tool calls, tool responses and reasoning are delimited by <tool_call>, <tool_response> and <think> tags. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7da928a01a9ad86a79c29a0d1cd90c70bd6a6e3e
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3840/trainer_state.json
@@ -0,0 +1,4066 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.253317249698432,
+ "eval_steps": 20,
+ "global_step": 3840,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.798039386469376e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- ChatML prompt builder. With tools: emit a system turn carrying the tool signatures and the tool-call format. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{#- Walk the messages backwards to record the index of the last user message that is not a wrapped tool response. -#}{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {#- No separate reasoning field: split an inline <think>...</think> block off the visible content. -#}{%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Reasoning is re-emitted only for assistant turns that come after the last real user query. -#}{%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {#- A run of consecutive tool messages is folded into a single user turn, each wrapped in <tool_response> tags. -#}{%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {#- When enable_thinking is explicitly false, pre-fill an empty think block. -#}{%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6b5d112b99ffb7d1d8d017f9427e64934ed3e59
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3860/trainer_state.json
@@ -0,0 +1,4087 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.301568154402895,
+ "eval_steps": 20,
+ "global_step": 3860,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.827893252054835e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f64c7d92ca51d624b16eb0cd736965cbab817ef
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3880/trainer_state.json
@@ -0,0 +1,4108 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.349819059107359,
+ "eval_steps": 20,
+ "global_step": 3880,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.85832437802537e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab81f60b899f34d88e31437457a50f64f888fe02
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3900/trainer_state.json
@@ -0,0 +1,4129 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.398069963811821,
+ "eval_steps": 20,
+ "global_step": 3900,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.887157478295757e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d4d8a17085ce1fb264da87e7f4d6eea30f096053
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3920/trainer_state.json
@@ -0,0 +1,4150 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.446320868516285,
+ "eval_steps": 20,
+ "global_step": 3920,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.916918067050701e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..11f3ef2a29d9b95de5a93587e2f4274fd037b534
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3940/trainer_state.json
@@ -0,0 +1,4171 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.494571773220748,
+ "eval_steps": 20,
+ "global_step": 3940,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.9500278220032e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT), supervised fine-tuned for causal-language-model text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/chat_template.jinja
@@ -0,0 +1,96 @@
+{#- Qwen3 chat template: renders `messages` as ChatML-style `<|im_start|>role ... <|im_end|>` turns. -#}
+{#- Variables read below: `messages`, `tools` (optional), `add_generation_prompt`, `enable_thinking` (optional). -#}
+{#- The literal markers <tools>, <tool_call>, <tool_response> and <think> are part of the rendered prompt. -#}
+{#- They must stay byte-exact: the `in` / split / startswith / endswith checks below match on them. -#}
+{#- System turn: when tools are supplied, the system text is followed by the tool signatures and call format. -#}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Scan the conversation backwards for the last user turn that is not a wrapped tool response; its index is kept in ns.last_query_index. -#}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{#- Render every turn in order. A leading system message was already emitted above, hence `not loop.first`. -#}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {#- No explicit reasoning_content: split an inline <think>...</think> section out of the content. -#}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Reasoning is emitted only for assistant turns after the last user query: always for the final turn, otherwise only when non-empty. -#}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages share one user turn: open it on the first, close it after the last. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Open the assistant turn for generation; when enable_thinking is explicitly false, pre-fill an empty reasoning block. -#}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..99d4cbc6cecf2fae622bcc7c3e55e4ce3c326cae
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3960/trainer_state.json
@@ -0,0 +1,4192 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.54282267792521,
+ "eval_steps": 20,
+ "global_step": 3960,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ },
+ {
+ "entropy": 0.13731490727514029,
+ "epoch": 9.54282267792521,
+ "grad_norm": 0.5770965218544006,
+ "learning_rate": 1.5269247579836162e-06,
+ "loss": 0.07547505497932434,
+ "mean_token_accuracy": 0.9772329092025757,
+ "num_tokens": 11106761.0,
+ "step": 3960
+ },
+ {
+ "epoch": 9.54282267792521,
+ "eval_entropy": 0.24866497918461147,
+ "eval_loss": 0.8349169492721558,
+ "eval_mean_token_accuracy": 0.8455062398080075,
+ "eval_num_tokens": 11106761.0,
+ "eval_runtime": 55.1484,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3960
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.977695841862246e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Chat template: renders `messages` (and optional `tools`) into <|im_start|>role ... <|im_end|> turns. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Walk the messages backwards to record the index of the last user turn that is not a wrapped tool response. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {#- No explicit reasoning_content: split an inline <think>...</think> section out of the content. -#}{%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {#- The think section is only re-emitted for assistant turns after the last user query. -#}{%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {#- Consecutive tool messages are grouped into a single user turn. -#}{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {#- With enable_thinking explicitly false, pre-fill an empty think section. -#}{%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..70d6c02f8ec86b19c045682b96ab6f6e0590d754
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-3980/trainer_state.json
@@ -0,0 +1,4213 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.591073582629674,
+ "eval_steps": 20,
+ "global_step": 3980,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ },
+ {
+ "entropy": 0.13731490727514029,
+ "epoch": 9.54282267792521,
+ "grad_norm": 0.5770965218544006,
+ "learning_rate": 1.5269247579836162e-06,
+ "loss": 0.07547505497932434,
+ "mean_token_accuracy": 0.9772329092025757,
+ "num_tokens": 11106761.0,
+ "step": 3960
+ },
+ {
+ "epoch": 9.54282267792521,
+ "eval_entropy": 0.24866497918461147,
+ "eval_loss": 0.8349169492721558,
+ "eval_mean_token_accuracy": 0.8455062398080075,
+ "eval_num_tokens": 11106761.0,
+ "eval_runtime": 55.1484,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3960
+ },
+ {
+ "entropy": 0.13437952492386102,
+ "epoch": 9.591073582629674,
+ "grad_norm": 0.5746839046478271,
+ "learning_rate": 1.2244148098023241e-06,
+ "loss": 0.0719214141368866,
+ "mean_token_accuracy": 0.9783813208341599,
+ "num_tokens": 11163241.0,
+ "step": 3980
+ },
+ {
+ "epoch": 9.591073582629674,
+ "eval_entropy": 0.2487325757909357,
+ "eval_loss": 0.8345232009887695,
+ "eval_mean_token_accuracy": 0.8452854909923639,
+ "eval_num_tokens": 11163241.0,
+ "eval_runtime": 55.2029,
+ "eval_samples_per_second": 25.723,
+ "eval_steps_per_second": 3.224,
+ "step": 3980
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.00520194722304e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- System turn: optional system message followed by the tool-calling instructions and tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Split inline reasoning from the visible answer at the closing think tag. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- The think block is only rendered for assistant turns after the last user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool results share one user turn; each result is wrapped in tool_response tags. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..75455a63f2ec348fc0447419af314dd24bc30976
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-40/trainer_state.json
@@ -0,0 +1,76 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.09650180940892641,
+ "eval_steps": 20,
+ "global_step": 40,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5990484447313920.0,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7200eb556657006c72b25c612af74af9a02996b5
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-400/trainer_state.json
@@ -0,0 +1,454 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.9650180940892642,
+ "eval_steps": 20,
+ "global_step": 400,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.083550085005312e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7f9405d9329cf7a8d5c17b1e530b6a335b50ef2
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4000/trainer_state.json
@@ -0,0 +1,4234 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.639324487334138,
+ "eval_steps": 20,
+ "global_step": 4000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ },
+ {
+ "entropy": 0.13731490727514029,
+ "epoch": 9.54282267792521,
+ "grad_norm": 0.5770965218544006,
+ "learning_rate": 1.5269247579836162e-06,
+ "loss": 0.07547505497932434,
+ "mean_token_accuracy": 0.9772329092025757,
+ "num_tokens": 11106761.0,
+ "step": 3960
+ },
+ {
+ "epoch": 9.54282267792521,
+ "eval_entropy": 0.24866497918461147,
+ "eval_loss": 0.8349169492721558,
+ "eval_mean_token_accuracy": 0.8455062398080075,
+ "eval_num_tokens": 11106761.0,
+ "eval_runtime": 55.1484,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3960
+ },
+ {
+ "entropy": 0.13437952492386102,
+ "epoch": 9.591073582629674,
+ "grad_norm": 0.5746839046478271,
+ "learning_rate": 1.2244148098023241e-06,
+ "loss": 0.0719214141368866,
+ "mean_token_accuracy": 0.9783813208341599,
+ "num_tokens": 11163241.0,
+ "step": 3980
+ },
+ {
+ "epoch": 9.591073582629674,
+ "eval_entropy": 0.2487325757909357,
+ "eval_loss": 0.8345232009887695,
+ "eval_mean_token_accuracy": 0.8452854909923639,
+ "eval_num_tokens": 11163241.0,
+ "eval_runtime": 55.2029,
+ "eval_samples_per_second": 25.723,
+ "eval_steps_per_second": 3.224,
+ "step": 3980
+ },
+ {
+ "entropy": 0.1449177075177431,
+ "epoch": 9.639324487334138,
+ "grad_norm": 0.7029407024383545,
+ "learning_rate": 9.551139612183896e-07,
+ "loss": 0.08021060228347779,
+ "mean_token_accuracy": 0.9748102590441704,
+ "num_tokens": 11215311.0,
+ "step": 4000
+ },
+ {
+ "epoch": 9.639324487334138,
+ "eval_entropy": 0.24849342898036655,
+ "eval_loss": 0.8350681066513062,
+ "eval_mean_token_accuracy": 0.8454755872822879,
+ "eval_num_tokens": 11215311.0,
+ "eval_runtime": 55.1818,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4000
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.033403228962202e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- With tools: emit one system turn holding the optional leading system message and the JSON tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response; its index is kept in ns.last_query_index. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}{#- No explicit reasoning_content: split inline reasoning off the message at the closing think tag. -#}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- A think block is only emitted for assistant turns located after the last user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are merged into a single user turn, each wrapped in tool_response tags. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking explicitly disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4984a4eca1204d09b3588e203ae5513197325724
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4020/trainer_state.json
@@ -0,0 +1,4255 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.687575392038601,
+ "eval_steps": 20,
+ "global_step": 4020,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ },
+ {
+ "entropy": 0.13731490727514029,
+ "epoch": 9.54282267792521,
+ "grad_norm": 0.5770965218544006,
+ "learning_rate": 1.5269247579836162e-06,
+ "loss": 0.07547505497932434,
+ "mean_token_accuracy": 0.9772329092025757,
+ "num_tokens": 11106761.0,
+ "step": 3960
+ },
+ {
+ "epoch": 9.54282267792521,
+ "eval_entropy": 0.24866497918461147,
+ "eval_loss": 0.8349169492721558,
+ "eval_mean_token_accuracy": 0.8455062398080075,
+ "eval_num_tokens": 11106761.0,
+ "eval_runtime": 55.1484,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3960
+ },
+ {
+ "entropy": 0.13437952492386102,
+ "epoch": 9.591073582629674,
+ "grad_norm": 0.5746839046478271,
+ "learning_rate": 1.2244148098023241e-06,
+ "loss": 0.0719214141368866,
+ "mean_token_accuracy": 0.9783813208341599,
+ "num_tokens": 11163241.0,
+ "step": 3980
+ },
+ {
+ "epoch": 9.591073582629674,
+ "eval_entropy": 0.2487325757909357,
+ "eval_loss": 0.8345232009887695,
+ "eval_mean_token_accuracy": 0.8452854909923639,
+ "eval_num_tokens": 11163241.0,
+ "eval_runtime": 55.2029,
+ "eval_samples_per_second": 25.723,
+ "eval_steps_per_second": 3.224,
+ "step": 3980
+ },
+ {
+ "entropy": 0.1449177075177431,
+ "epoch": 9.639324487334138,
+ "grad_norm": 0.7029407024383545,
+ "learning_rate": 9.551139612183896e-07,
+ "loss": 0.08021060228347779,
+ "mean_token_accuracy": 0.9748102590441704,
+ "num_tokens": 11215311.0,
+ "step": 4000
+ },
+ {
+ "epoch": 9.639324487334138,
+ "eval_entropy": 0.24849342898036655,
+ "eval_loss": 0.8350681066513062,
+ "eval_mean_token_accuracy": 0.8454755872822879,
+ "eval_num_tokens": 11215311.0,
+ "eval_runtime": 55.1818,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4000
+ },
+ {
+ "entropy": 0.13718088436871767,
+ "epoch": 9.687575392038601,
+ "grad_norm": 0.4247730076313019,
+ "learning_rate": 7.190984211864178e-07,
+ "loss": 0.0763831913471222,
+ "mean_token_accuracy": 0.9777001023292542,
+ "num_tokens": 11272587.0,
+ "step": 4020
+ },
+ {
+ "epoch": 9.687575392038601,
+ "eval_entropy": 0.2485166782241189,
+ "eval_loss": 0.8347740769386292,
+ "eval_mean_token_accuracy": 0.8455832382936156,
+ "eval_num_tokens": 11272587.0,
+ "eval_runtime": 55.1819,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4020
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.064133544766464e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tool-enabled system prompt: optional leading system text, then the tool signatures and call format. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- ns.last_query_index: index of the last user message that is not a wrapped tool response (defaults to the final message). -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last user query can have their reasoning re-emitted. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Emit an empty think block when thinking is explicitly disabled. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..96ffb71ad5524373a3bddde46cfec956684cad63
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4040/trainer_state.json
@@ -0,0 +1,4276 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.735826296743063,
+ "eval_steps": 20,
+ "global_step": 4040,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ },
+ {
+ "entropy": 0.13731490727514029,
+ "epoch": 9.54282267792521,
+ "grad_norm": 0.5770965218544006,
+ "learning_rate": 1.5269247579836162e-06,
+ "loss": 0.07547505497932434,
+ "mean_token_accuracy": 0.9772329092025757,
+ "num_tokens": 11106761.0,
+ "step": 3960
+ },
+ {
+ "epoch": 9.54282267792521,
+ "eval_entropy": 0.24866497918461147,
+ "eval_loss": 0.8349169492721558,
+ "eval_mean_token_accuracy": 0.8455062398080075,
+ "eval_num_tokens": 11106761.0,
+ "eval_runtime": 55.1484,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3960
+ },
+ {
+ "entropy": 0.13437952492386102,
+ "epoch": 9.591073582629674,
+ "grad_norm": 0.5746839046478271,
+ "learning_rate": 1.2244148098023241e-06,
+ "loss": 0.0719214141368866,
+ "mean_token_accuracy": 0.9783813208341599,
+ "num_tokens": 11163241.0,
+ "step": 3980
+ },
+ {
+ "epoch": 9.591073582629674,
+ "eval_entropy": 0.2487325757909357,
+ "eval_loss": 0.8345232009887695,
+ "eval_mean_token_accuracy": 0.8452854909923639,
+ "eval_num_tokens": 11163241.0,
+ "eval_runtime": 55.2029,
+ "eval_samples_per_second": 25.723,
+ "eval_steps_per_second": 3.224,
+ "step": 3980
+ },
+ {
+ "entropy": 0.1449177075177431,
+ "epoch": 9.639324487334138,
+ "grad_norm": 0.7029407024383545,
+ "learning_rate": 9.551139612183896e-07,
+ "loss": 0.08021060228347779,
+ "mean_token_accuracy": 0.9748102590441704,
+ "num_tokens": 11215311.0,
+ "step": 4000
+ },
+ {
+ "epoch": 9.639324487334138,
+ "eval_entropy": 0.24849342898036655,
+ "eval_loss": 0.8350681066513062,
+ "eval_mean_token_accuracy": 0.8454755872822879,
+ "eval_num_tokens": 11215311.0,
+ "eval_runtime": 55.1818,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4000
+ },
+ {
+ "entropy": 0.13718088436871767,
+ "epoch": 9.687575392038601,
+ "grad_norm": 0.4247730076313019,
+ "learning_rate": 7.190984211864178e-07,
+ "loss": 0.0763831913471222,
+ "mean_token_accuracy": 0.9777001023292542,
+ "num_tokens": 11272587.0,
+ "step": 4020
+ },
+ {
+ "epoch": 9.687575392038601,
+ "eval_entropy": 0.2485166782241189,
+ "eval_loss": 0.8347740769386292,
+ "eval_mean_token_accuracy": 0.8455832382936156,
+ "eval_num_tokens": 11272587.0,
+ "eval_runtime": 55.1819,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4020
+ },
+ {
+ "entropy": 0.1414831655099988,
+ "epoch": 9.735826296743063,
+ "grad_norm": 0.4344032108783722,
+ "learning_rate": 5.164349793124746e-07,
+ "loss": 0.0786937952041626,
+ "mean_token_accuracy": 0.9762448608875275,
+ "num_tokens": 11326845.0,
+ "step": 4040
+ },
+ {
+ "epoch": 9.735826296743063,
+ "eval_entropy": 0.2485147834326444,
+ "eval_loss": 0.8348782658576965,
+ "eval_mean_token_accuracy": 0.8454727480250798,
+ "eval_num_tokens": 11326845.0,
+ "eval_runtime": 55.1896,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 4040
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.094393956537754e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f4915d4c9422633732b78d88ce7ed1a10fdc0383
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4060/trainer_state.json
@@ -0,0 +1,4297 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.784077201447527,
+ "eval_steps": 20,
+ "global_step": 4060,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ },
+ {
+ "entropy": 0.13731490727514029,
+ "epoch": 9.54282267792521,
+ "grad_norm": 0.5770965218544006,
+ "learning_rate": 1.5269247579836162e-06,
+ "loss": 0.07547505497932434,
+ "mean_token_accuracy": 0.9772329092025757,
+ "num_tokens": 11106761.0,
+ "step": 3960
+ },
+ {
+ "epoch": 9.54282267792521,
+ "eval_entropy": 0.24866497918461147,
+ "eval_loss": 0.8349169492721558,
+ "eval_mean_token_accuracy": 0.8455062398080075,
+ "eval_num_tokens": 11106761.0,
+ "eval_runtime": 55.1484,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3960
+ },
+ {
+ "entropy": 0.13437952492386102,
+ "epoch": 9.591073582629674,
+ "grad_norm": 0.5746839046478271,
+ "learning_rate": 1.2244148098023241e-06,
+ "loss": 0.0719214141368866,
+ "mean_token_accuracy": 0.9783813208341599,
+ "num_tokens": 11163241.0,
+ "step": 3980
+ },
+ {
+ "epoch": 9.591073582629674,
+ "eval_entropy": 0.2487325757909357,
+ "eval_loss": 0.8345232009887695,
+ "eval_mean_token_accuracy": 0.8452854909923639,
+ "eval_num_tokens": 11163241.0,
+ "eval_runtime": 55.2029,
+ "eval_samples_per_second": 25.723,
+ "eval_steps_per_second": 3.224,
+ "step": 3980
+ },
+ {
+ "entropy": 0.1449177075177431,
+ "epoch": 9.639324487334138,
+ "grad_norm": 0.7029407024383545,
+ "learning_rate": 9.551139612183896e-07,
+ "loss": 0.08021060228347779,
+ "mean_token_accuracy": 0.9748102590441704,
+ "num_tokens": 11215311.0,
+ "step": 4000
+ },
+ {
+ "epoch": 9.639324487334138,
+ "eval_entropy": 0.24849342898036655,
+ "eval_loss": 0.8350681066513062,
+ "eval_mean_token_accuracy": 0.8454755872822879,
+ "eval_num_tokens": 11215311.0,
+ "eval_runtime": 55.1818,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4000
+ },
+ {
+ "entropy": 0.13718088436871767,
+ "epoch": 9.687575392038601,
+ "grad_norm": 0.4247730076313019,
+ "learning_rate": 7.190984211864178e-07,
+ "loss": 0.0763831913471222,
+ "mean_token_accuracy": 0.9777001023292542,
+ "num_tokens": 11272587.0,
+ "step": 4020
+ },
+ {
+ "epoch": 9.687575392038601,
+ "eval_entropy": 0.2485166782241189,
+ "eval_loss": 0.8347740769386292,
+ "eval_mean_token_accuracy": 0.8455832382936156,
+ "eval_num_tokens": 11272587.0,
+ "eval_runtime": 55.1819,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4020
+ },
+ {
+ "entropy": 0.1414831655099988,
+ "epoch": 9.735826296743063,
+ "grad_norm": 0.4344032108783722,
+ "learning_rate": 5.164349793124746e-07,
+ "loss": 0.0786937952041626,
+ "mean_token_accuracy": 0.9762448608875275,
+ "num_tokens": 11326845.0,
+ "step": 4040
+ },
+ {
+ "epoch": 9.735826296743063,
+ "eval_entropy": 0.2485147834326444,
+ "eval_loss": 0.8348782658576965,
+ "eval_mean_token_accuracy": 0.8454727480250798,
+ "eval_num_tokens": 11326845.0,
+ "eval_runtime": 55.1896,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 4040
+ },
+ {
+ "entropy": 0.1397345969453454,
+ "epoch": 9.784077201447527,
+ "grad_norm": 0.5865362882614136,
+ "learning_rate": 3.4718098695330847e-07,
+ "loss": 0.07839923501014709,
+ "mean_token_accuracy": 0.9766460061073303,
+ "num_tokens": 11381321.0,
+ "step": 4060
+ },
+ {
+ "epoch": 9.784077201447527,
+ "eval_entropy": 0.24835219778371662,
+ "eval_loss": 0.8349990248680115,
+ "eval_mean_token_accuracy": 0.8452944608216875,
+ "eval_num_tokens": 11381321.0,
+ "eval_runtime": 55.1862,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 4060
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.124510053212774e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f73da204a790970671f1252563210ae5cdf9f67
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4080/trainer_state.json
@@ -0,0 +1,4318 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.83232810615199,
+ "eval_steps": 20,
+ "global_step": 4080,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ },
+ {
+ "entropy": 0.13731490727514029,
+ "epoch": 9.54282267792521,
+ "grad_norm": 0.5770965218544006,
+ "learning_rate": 1.5269247579836162e-06,
+ "loss": 0.07547505497932434,
+ "mean_token_accuracy": 0.9772329092025757,
+ "num_tokens": 11106761.0,
+ "step": 3960
+ },
+ {
+ "epoch": 9.54282267792521,
+ "eval_entropy": 0.24866497918461147,
+ "eval_loss": 0.8349169492721558,
+ "eval_mean_token_accuracy": 0.8455062398080075,
+ "eval_num_tokens": 11106761.0,
+ "eval_runtime": 55.1484,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3960
+ },
+ {
+ "entropy": 0.13437952492386102,
+ "epoch": 9.591073582629674,
+ "grad_norm": 0.5746839046478271,
+ "learning_rate": 1.2244148098023241e-06,
+ "loss": 0.0719214141368866,
+ "mean_token_accuracy": 0.9783813208341599,
+ "num_tokens": 11163241.0,
+ "step": 3980
+ },
+ {
+ "epoch": 9.591073582629674,
+ "eval_entropy": 0.2487325757909357,
+ "eval_loss": 0.8345232009887695,
+ "eval_mean_token_accuracy": 0.8452854909923639,
+ "eval_num_tokens": 11163241.0,
+ "eval_runtime": 55.2029,
+ "eval_samples_per_second": 25.723,
+ "eval_steps_per_second": 3.224,
+ "step": 3980
+ },
+ {
+ "entropy": 0.1449177075177431,
+ "epoch": 9.639324487334138,
+ "grad_norm": 0.7029407024383545,
+ "learning_rate": 9.551139612183896e-07,
+ "loss": 0.08021060228347779,
+ "mean_token_accuracy": 0.9748102590441704,
+ "num_tokens": 11215311.0,
+ "step": 4000
+ },
+ {
+ "epoch": 9.639324487334138,
+ "eval_entropy": 0.24849342898036655,
+ "eval_loss": 0.8350681066513062,
+ "eval_mean_token_accuracy": 0.8454755872822879,
+ "eval_num_tokens": 11215311.0,
+ "eval_runtime": 55.1818,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4000
+ },
+ {
+ "entropy": 0.13718088436871767,
+ "epoch": 9.687575392038601,
+ "grad_norm": 0.4247730076313019,
+ "learning_rate": 7.190984211864178e-07,
+ "loss": 0.0763831913471222,
+ "mean_token_accuracy": 0.9777001023292542,
+ "num_tokens": 11272587.0,
+ "step": 4020
+ },
+ {
+ "epoch": 9.687575392038601,
+ "eval_entropy": 0.2485166782241189,
+ "eval_loss": 0.8347740769386292,
+ "eval_mean_token_accuracy": 0.8455832382936156,
+ "eval_num_tokens": 11272587.0,
+ "eval_runtime": 55.1819,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4020
+ },
+ {
+ "entropy": 0.1414831655099988,
+ "epoch": 9.735826296743063,
+ "grad_norm": 0.4344032108783722,
+ "learning_rate": 5.164349793124746e-07,
+ "loss": 0.0786937952041626,
+ "mean_token_accuracy": 0.9762448608875275,
+ "num_tokens": 11326845.0,
+ "step": 4040
+ },
+ {
+ "epoch": 9.735826296743063,
+ "eval_entropy": 0.2485147834326444,
+ "eval_loss": 0.8348782658576965,
+ "eval_mean_token_accuracy": 0.8454727480250798,
+ "eval_num_tokens": 11326845.0,
+ "eval_runtime": 55.1896,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 4040
+ },
+ {
+ "entropy": 0.1397345969453454,
+ "epoch": 9.784077201447527,
+ "grad_norm": 0.5865362882614136,
+ "learning_rate": 3.4718098695330847e-07,
+ "loss": 0.07839923501014709,
+ "mean_token_accuracy": 0.9766460061073303,
+ "num_tokens": 11381321.0,
+ "step": 4060
+ },
+ {
+ "epoch": 9.784077201447527,
+ "eval_entropy": 0.24835219778371662,
+ "eval_loss": 0.8349990248680115,
+ "eval_mean_token_accuracy": 0.8452944608216875,
+ "eval_num_tokens": 11381321.0,
+ "eval_runtime": 55.1862,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 4060
+ },
+ {
+ "entropy": 0.13877951726317406,
+ "epoch": 9.83232810615199,
+ "grad_norm": 0.36521604657173157,
+ "learning_rate": 2.1138434098667948e-07,
+ "loss": 0.0738587200641632,
+ "mean_token_accuracy": 0.9764896467328071,
+ "num_tokens": 11441968.0,
+ "step": 4080
+ },
+ {
+ "epoch": 9.83232810615199,
+ "eval_entropy": 0.24834532483240193,
+ "eval_loss": 0.8350111246109009,
+ "eval_mean_token_accuracy": 0.8456257598453694,
+ "eval_num_tokens": 11441968.0,
+ "eval_runtime": 55.1871,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 4080
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.158010514889318e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Chat template: optional system/tools preamble, then every message rendered as an <|im_start|>role ... <|im_end|> turn. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Scan messages in reverse to record the index of the last user message that is not a wrapped tool response. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {#- No explicit reasoning field: split inline reasoning from the answer at the closing think tag. -#}{%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Only assistant turns after the last user query are rendered with their reasoning block. -#}{%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {#- Tool results are emitted inside a user turn; consecutive tool messages share one turn. -#}{%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {#- Thinking explicitly disabled: pre-fill an empty reasoning block. -#}{%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef7c15e6c3905b6f953723c4cf34752ff6f6994a
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4100/trainer_state.json
@@ -0,0 +1,4339 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.880579010856454,
+ "eval_steps": 20,
+ "global_step": 4100,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ },
+ {
+ "entropy": 0.13731490727514029,
+ "epoch": 9.54282267792521,
+ "grad_norm": 0.5770965218544006,
+ "learning_rate": 1.5269247579836162e-06,
+ "loss": 0.07547505497932434,
+ "mean_token_accuracy": 0.9772329092025757,
+ "num_tokens": 11106761.0,
+ "step": 3960
+ },
+ {
+ "epoch": 9.54282267792521,
+ "eval_entropy": 0.24866497918461147,
+ "eval_loss": 0.8349169492721558,
+ "eval_mean_token_accuracy": 0.8455062398080075,
+ "eval_num_tokens": 11106761.0,
+ "eval_runtime": 55.1484,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3960
+ },
+ {
+ "entropy": 0.13437952492386102,
+ "epoch": 9.591073582629674,
+ "grad_norm": 0.5746839046478271,
+ "learning_rate": 1.2244148098023241e-06,
+ "loss": 0.0719214141368866,
+ "mean_token_accuracy": 0.9783813208341599,
+ "num_tokens": 11163241.0,
+ "step": 3980
+ },
+ {
+ "epoch": 9.591073582629674,
+ "eval_entropy": 0.2487325757909357,
+ "eval_loss": 0.8345232009887695,
+ "eval_mean_token_accuracy": 0.8452854909923639,
+ "eval_num_tokens": 11163241.0,
+ "eval_runtime": 55.2029,
+ "eval_samples_per_second": 25.723,
+ "eval_steps_per_second": 3.224,
+ "step": 3980
+ },
+ {
+ "entropy": 0.1449177075177431,
+ "epoch": 9.639324487334138,
+ "grad_norm": 0.7029407024383545,
+ "learning_rate": 9.551139612183896e-07,
+ "loss": 0.08021060228347779,
+ "mean_token_accuracy": 0.9748102590441704,
+ "num_tokens": 11215311.0,
+ "step": 4000
+ },
+ {
+ "epoch": 9.639324487334138,
+ "eval_entropy": 0.24849342898036655,
+ "eval_loss": 0.8350681066513062,
+ "eval_mean_token_accuracy": 0.8454755872822879,
+ "eval_num_tokens": 11215311.0,
+ "eval_runtime": 55.1818,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4000
+ },
+ {
+ "entropy": 0.13718088436871767,
+ "epoch": 9.687575392038601,
+ "grad_norm": 0.4247730076313019,
+ "learning_rate": 7.190984211864178e-07,
+ "loss": 0.0763831913471222,
+ "mean_token_accuracy": 0.9777001023292542,
+ "num_tokens": 11272587.0,
+ "step": 4020
+ },
+ {
+ "epoch": 9.687575392038601,
+ "eval_entropy": 0.2485166782241189,
+ "eval_loss": 0.8347740769386292,
+ "eval_mean_token_accuracy": 0.8455832382936156,
+ "eval_num_tokens": 11272587.0,
+ "eval_runtime": 55.1819,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4020
+ },
+ {
+ "entropy": 0.1414831655099988,
+ "epoch": 9.735826296743063,
+ "grad_norm": 0.4344032108783722,
+ "learning_rate": 5.164349793124746e-07,
+ "loss": 0.0786937952041626,
+ "mean_token_accuracy": 0.9762448608875275,
+ "num_tokens": 11326845.0,
+ "step": 4040
+ },
+ {
+ "epoch": 9.735826296743063,
+ "eval_entropy": 0.2485147834326444,
+ "eval_loss": 0.8348782658576965,
+ "eval_mean_token_accuracy": 0.8454727480250798,
+ "eval_num_tokens": 11326845.0,
+ "eval_runtime": 55.1896,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 4040
+ },
+ {
+ "entropy": 0.1397345969453454,
+ "epoch": 9.784077201447527,
+ "grad_norm": 0.5865362882614136,
+ "learning_rate": 3.4718098695330847e-07,
+ "loss": 0.07839923501014709,
+ "mean_token_accuracy": 0.9766460061073303,
+ "num_tokens": 11381321.0,
+ "step": 4060
+ },
+ {
+ "epoch": 9.784077201447527,
+ "eval_entropy": 0.24835219778371662,
+ "eval_loss": 0.8349990248680115,
+ "eval_mean_token_accuracy": 0.8452944608216875,
+ "eval_num_tokens": 11381321.0,
+ "eval_runtime": 55.1862,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 4060
+ },
+ {
+ "entropy": 0.13877951726317406,
+ "epoch": 9.83232810615199,
+ "grad_norm": 0.36521604657173157,
+ "learning_rate": 2.1138434098667948e-07,
+ "loss": 0.0738587200641632,
+ "mean_token_accuracy": 0.9764896467328071,
+ "num_tokens": 11441968.0,
+ "step": 4080
+ },
+ {
+ "epoch": 9.83232810615199,
+ "eval_entropy": 0.24834532483240193,
+ "eval_loss": 0.8350111246109009,
+ "eval_mean_token_accuracy": 0.8456257598453694,
+ "eval_num_tokens": 11441968.0,
+ "eval_runtime": 55.1871,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 4080
+ },
+ {
+ "entropy": 0.1345276204869151,
+ "epoch": 9.880579010856454,
+ "grad_norm": 0.45626401901245117,
+ "learning_rate": 1.0908347025708512e-07,
+ "loss": 0.07468653917312622,
+ "mean_token_accuracy": 0.978096280992031,
+ "num_tokens": 11500487.0,
+ "step": 4100
+ },
+ {
+ "epoch": 9.880579010856454,
+ "eval_entropy": 0.2485178895713238,
+ "eval_loss": 0.834865152835846,
+ "eval_mean_token_accuracy": 0.8453632285085957,
+ "eval_num_tokens": 11500487.0,
+ "eval_runtime": 55.1746,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 4100
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.189029460886118e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef6e08d25f89f60b81bcbd37ef1335478f4511b8
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4120/trainer_state.json
@@ -0,0 +1,4360 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.928829915560916,
+ "eval_steps": 20,
+ "global_step": 4120,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ },
+ {
+ "entropy": 0.13731490727514029,
+ "epoch": 9.54282267792521,
+ "grad_norm": 0.5770965218544006,
+ "learning_rate": 1.5269247579836162e-06,
+ "loss": 0.07547505497932434,
+ "mean_token_accuracy": 0.9772329092025757,
+ "num_tokens": 11106761.0,
+ "step": 3960
+ },
+ {
+ "epoch": 9.54282267792521,
+ "eval_entropy": 0.24866497918461147,
+ "eval_loss": 0.8349169492721558,
+ "eval_mean_token_accuracy": 0.8455062398080075,
+ "eval_num_tokens": 11106761.0,
+ "eval_runtime": 55.1484,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3960
+ },
+ {
+ "entropy": 0.13437952492386102,
+ "epoch": 9.591073582629674,
+ "grad_norm": 0.5746839046478271,
+ "learning_rate": 1.2244148098023241e-06,
+ "loss": 0.0719214141368866,
+ "mean_token_accuracy": 0.9783813208341599,
+ "num_tokens": 11163241.0,
+ "step": 3980
+ },
+ {
+ "epoch": 9.591073582629674,
+ "eval_entropy": 0.2487325757909357,
+ "eval_loss": 0.8345232009887695,
+ "eval_mean_token_accuracy": 0.8452854909923639,
+ "eval_num_tokens": 11163241.0,
+ "eval_runtime": 55.2029,
+ "eval_samples_per_second": 25.723,
+ "eval_steps_per_second": 3.224,
+ "step": 3980
+ },
+ {
+ "entropy": 0.1449177075177431,
+ "epoch": 9.639324487334138,
+ "grad_norm": 0.7029407024383545,
+ "learning_rate": 9.551139612183896e-07,
+ "loss": 0.08021060228347779,
+ "mean_token_accuracy": 0.9748102590441704,
+ "num_tokens": 11215311.0,
+ "step": 4000
+ },
+ {
+ "epoch": 9.639324487334138,
+ "eval_entropy": 0.24849342898036655,
+ "eval_loss": 0.8350681066513062,
+ "eval_mean_token_accuracy": 0.8454755872822879,
+ "eval_num_tokens": 11215311.0,
+ "eval_runtime": 55.1818,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4000
+ },
+ {
+ "entropy": 0.13718088436871767,
+ "epoch": 9.687575392038601,
+ "grad_norm": 0.4247730076313019,
+ "learning_rate": 7.190984211864178e-07,
+ "loss": 0.0763831913471222,
+ "mean_token_accuracy": 0.9777001023292542,
+ "num_tokens": 11272587.0,
+ "step": 4020
+ },
+ {
+ "epoch": 9.687575392038601,
+ "eval_entropy": 0.2485166782241189,
+ "eval_loss": 0.8347740769386292,
+ "eval_mean_token_accuracy": 0.8455832382936156,
+ "eval_num_tokens": 11272587.0,
+ "eval_runtime": 55.1819,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4020
+ },
+ {
+ "entropy": 0.1414831655099988,
+ "epoch": 9.735826296743063,
+ "grad_norm": 0.4344032108783722,
+ "learning_rate": 5.164349793124746e-07,
+ "loss": 0.0786937952041626,
+ "mean_token_accuracy": 0.9762448608875275,
+ "num_tokens": 11326845.0,
+ "step": 4040
+ },
+ {
+ "epoch": 9.735826296743063,
+ "eval_entropy": 0.2485147834326444,
+ "eval_loss": 0.8348782658576965,
+ "eval_mean_token_accuracy": 0.8454727480250798,
+ "eval_num_tokens": 11326845.0,
+ "eval_runtime": 55.1896,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 4040
+ },
+ {
+ "entropy": 0.1397345969453454,
+ "epoch": 9.784077201447527,
+ "grad_norm": 0.5865362882614136,
+ "learning_rate": 3.4718098695330847e-07,
+ "loss": 0.07839923501014709,
+ "mean_token_accuracy": 0.9766460061073303,
+ "num_tokens": 11381321.0,
+ "step": 4060
+ },
+ {
+ "epoch": 9.784077201447527,
+ "eval_entropy": 0.24835219778371662,
+ "eval_loss": 0.8349990248680115,
+ "eval_mean_token_accuracy": 0.8452944608216875,
+ "eval_num_tokens": 11381321.0,
+ "eval_runtime": 55.1862,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 4060
+ },
+ {
+ "entropy": 0.13877951726317406,
+ "epoch": 9.83232810615199,
+ "grad_norm": 0.36521604657173157,
+ "learning_rate": 2.1138434098667948e-07,
+ "loss": 0.0738587200641632,
+ "mean_token_accuracy": 0.9764896467328071,
+ "num_tokens": 11441968.0,
+ "step": 4080
+ },
+ {
+ "epoch": 9.83232810615199,
+ "eval_entropy": 0.24834532483240193,
+ "eval_loss": 0.8350111246109009,
+ "eval_mean_token_accuracy": 0.8456257598453694,
+ "eval_num_tokens": 11441968.0,
+ "eval_runtime": 55.1871,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 4080
+ },
+ {
+ "entropy": 0.1345276204869151,
+ "epoch": 9.880579010856454,
+ "grad_norm": 0.45626401901245117,
+ "learning_rate": 1.0908347025708512e-07,
+ "loss": 0.07468653917312622,
+ "mean_token_accuracy": 0.978096280992031,
+ "num_tokens": 11500487.0,
+ "step": 4100
+ },
+ {
+ "epoch": 9.880579010856454,
+ "eval_entropy": 0.2485178895713238,
+ "eval_loss": 0.834865152835846,
+ "eval_mean_token_accuracy": 0.8453632285085957,
+ "eval_num_tokens": 11500487.0,
+ "eval_runtime": 55.1746,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 4100
+ },
+ {
+ "entropy": 0.1314420524984598,
+ "epoch": 9.928829915560916,
+ "grad_norm": 0.5756514072418213,
+ "learning_rate": 4.0307324700819896e-08,
+ "loss": 0.07114983201026917,
+ "mean_token_accuracy": 0.9784522473812103,
+ "num_tokens": 11562246.0,
+ "step": 4120
+ },
+ {
+ "epoch": 9.928829915560916,
+ "eval_entropy": 0.24849412364236426,
+ "eval_loss": 0.8347920775413513,
+ "eval_mean_token_accuracy": 0.8454210158814205,
+ "eval_num_tokens": 11562246.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 4120
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.222512323160678e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf3f09a807315c9f9fb1ad28b7112a9c146a267d
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4140/trainer_state.json
@@ -0,0 +1,4381 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.97708082026538,
+ "eval_steps": 20,
+ "global_step": 4140,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ },
+ {
+ "entropy": 0.13731490727514029,
+ "epoch": 9.54282267792521,
+ "grad_norm": 0.5770965218544006,
+ "learning_rate": 1.5269247579836162e-06,
+ "loss": 0.07547505497932434,
+ "mean_token_accuracy": 0.9772329092025757,
+ "num_tokens": 11106761.0,
+ "step": 3960
+ },
+ {
+ "epoch": 9.54282267792521,
+ "eval_entropy": 0.24866497918461147,
+ "eval_loss": 0.8349169492721558,
+ "eval_mean_token_accuracy": 0.8455062398080075,
+ "eval_num_tokens": 11106761.0,
+ "eval_runtime": 55.1484,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3960
+ },
+ {
+ "entropy": 0.13437952492386102,
+ "epoch": 9.591073582629674,
+ "grad_norm": 0.5746839046478271,
+ "learning_rate": 1.2244148098023241e-06,
+ "loss": 0.0719214141368866,
+ "mean_token_accuracy": 0.9783813208341599,
+ "num_tokens": 11163241.0,
+ "step": 3980
+ },
+ {
+ "epoch": 9.591073582629674,
+ "eval_entropy": 0.2487325757909357,
+ "eval_loss": 0.8345232009887695,
+ "eval_mean_token_accuracy": 0.8452854909923639,
+ "eval_num_tokens": 11163241.0,
+ "eval_runtime": 55.2029,
+ "eval_samples_per_second": 25.723,
+ "eval_steps_per_second": 3.224,
+ "step": 3980
+ },
+ {
+ "entropy": 0.1449177075177431,
+ "epoch": 9.639324487334138,
+ "grad_norm": 0.7029407024383545,
+ "learning_rate": 9.551139612183896e-07,
+ "loss": 0.08021060228347779,
+ "mean_token_accuracy": 0.9748102590441704,
+ "num_tokens": 11215311.0,
+ "step": 4000
+ },
+ {
+ "epoch": 9.639324487334138,
+ "eval_entropy": 0.24849342898036655,
+ "eval_loss": 0.8350681066513062,
+ "eval_mean_token_accuracy": 0.8454755872822879,
+ "eval_num_tokens": 11215311.0,
+ "eval_runtime": 55.1818,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4000
+ },
+ {
+ "entropy": 0.13718088436871767,
+ "epoch": 9.687575392038601,
+ "grad_norm": 0.4247730076313019,
+ "learning_rate": 7.190984211864178e-07,
+ "loss": 0.0763831913471222,
+ "mean_token_accuracy": 0.9777001023292542,
+ "num_tokens": 11272587.0,
+ "step": 4020
+ },
+ {
+ "epoch": 9.687575392038601,
+ "eval_entropy": 0.2485166782241189,
+ "eval_loss": 0.8347740769386292,
+ "eval_mean_token_accuracy": 0.8455832382936156,
+ "eval_num_tokens": 11272587.0,
+ "eval_runtime": 55.1819,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4020
+ },
+ {
+ "entropy": 0.1414831655099988,
+ "epoch": 9.735826296743063,
+ "grad_norm": 0.4344032108783722,
+ "learning_rate": 5.164349793124746e-07,
+ "loss": 0.0786937952041626,
+ "mean_token_accuracy": 0.9762448608875275,
+ "num_tokens": 11326845.0,
+ "step": 4040
+ },
+ {
+ "epoch": 9.735826296743063,
+ "eval_entropy": 0.2485147834326444,
+ "eval_loss": 0.8348782658576965,
+ "eval_mean_token_accuracy": 0.8454727480250798,
+ "eval_num_tokens": 11326845.0,
+ "eval_runtime": 55.1896,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 4040
+ },
+ {
+ "entropy": 0.1397345969453454,
+ "epoch": 9.784077201447527,
+ "grad_norm": 0.5865362882614136,
+ "learning_rate": 3.4718098695330847e-07,
+ "loss": 0.07839923501014709,
+ "mean_token_accuracy": 0.9766460061073303,
+ "num_tokens": 11381321.0,
+ "step": 4060
+ },
+ {
+ "epoch": 9.784077201447527,
+ "eval_entropy": 0.24835219778371662,
+ "eval_loss": 0.8349990248680115,
+ "eval_mean_token_accuracy": 0.8452944608216875,
+ "eval_num_tokens": 11381321.0,
+ "eval_runtime": 55.1862,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 4060
+ },
+ {
+ "entropy": 0.13877951726317406,
+ "epoch": 9.83232810615199,
+ "grad_norm": 0.36521604657173157,
+ "learning_rate": 2.1138434098667948e-07,
+ "loss": 0.0738587200641632,
+ "mean_token_accuracy": 0.9764896467328071,
+ "num_tokens": 11441968.0,
+ "step": 4080
+ },
+ {
+ "epoch": 9.83232810615199,
+ "eval_entropy": 0.24834532483240193,
+ "eval_loss": 0.8350111246109009,
+ "eval_mean_token_accuracy": 0.8456257598453694,
+ "eval_num_tokens": 11441968.0,
+ "eval_runtime": 55.1871,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 4080
+ },
+ {
+ "entropy": 0.1345276204869151,
+ "epoch": 9.880579010856454,
+ "grad_norm": 0.45626401901245117,
+ "learning_rate": 1.0908347025708512e-07,
+ "loss": 0.07468653917312622,
+ "mean_token_accuracy": 0.978096280992031,
+ "num_tokens": 11500487.0,
+ "step": 4100
+ },
+ {
+ "epoch": 9.880579010856454,
+ "eval_entropy": 0.2485178895713238,
+ "eval_loss": 0.834865152835846,
+ "eval_mean_token_accuracy": 0.8453632285085957,
+ "eval_num_tokens": 11500487.0,
+ "eval_runtime": 55.1746,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 4100
+ },
+ {
+ "entropy": 0.1314420524984598,
+ "epoch": 9.928829915560916,
+ "grad_norm": 0.5756514072418213,
+ "learning_rate": 4.0307324700819896e-08,
+ "loss": 0.07114983201026917,
+ "mean_token_accuracy": 0.9784522473812103,
+ "num_tokens": 11562246.0,
+ "step": 4120
+ },
+ {
+ "epoch": 9.928829915560916,
+ "eval_entropy": 0.24849412364236426,
+ "eval_loss": 0.8347920775413513,
+ "eval_mean_token_accuracy": 0.8454210158814205,
+ "eval_num_tokens": 11562246.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 4120
+ },
+ {
+ "entropy": 0.14091254398226738,
+ "epoch": 9.97708082026538,
+ "grad_norm": 0.4619421064853668,
+ "learning_rate": 5.075367153567275e-09,
+ "loss": 0.07807959914207459,
+ "mean_token_accuracy": 0.9760556846857071,
+ "num_tokens": 11614714.0,
+ "step": 4140
+ },
+ {
+ "epoch": 9.97708082026538,
+ "eval_entropy": 0.24850971368926295,
+ "eval_loss": 0.8348681926727295,
+ "eval_mean_token_accuracy": 0.8453842609116201,
+ "eval_num_tokens": 11614714.0,
+ "eval_runtime": 55.1689,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 4140
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.24999906917929e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- System turn: with tools, emit the optional system text followed by each tool's JSON signature inside <tools> tags; otherwise emit only a leading system message. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Scan the messages in reverse to record the index of the last user message that is not a wrapped tool response. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {#- Prefer an explicit reasoning_content field; otherwise split inline <think>...</think> text off the visible answer. -#}{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Only assistant turns after the last user query are rendered with a think block; earlier turns get the answer text alone. -#}{%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {#- A run of consecutive tool messages is rendered as one user turn, each result wrapped in <tool_response> tags. -#}{%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Generation prompt: open the assistant turn and, when enable_thinking is explicitly false, pre-fill an empty think block. -#}{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..85d7ccdc7a5e89ba20ea4d1c3cc1f99a77dc3116
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-4150/trainer_state.json
@@ -0,0 +1,4392 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 10.0,
+ "eval_steps": 20,
+ "global_step": 4150,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ },
+ {
+ "entropy": 0.5248019628226757,
+ "epoch": 2.4101326899879374,
+ "grad_norm": 0.4942329525947571,
+ "learning_rate": 0.00022313151729919296,
+ "loss": 0.4856869220733643,
+ "mean_token_accuracy": 0.8571616068482399,
+ "num_tokens": 2808227.0,
+ "step": 1000
+ },
+ {
+ "epoch": 2.4101326899879374,
+ "eval_entropy": 0.5513194481308541,
+ "eval_loss": 0.5423293709754944,
+ "eval_mean_token_accuracy": 0.8457836836911319,
+ "eval_num_tokens": 2808227.0,
+ "eval_runtime": 55.2589,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1000
+ },
+ {
+ "entropy": 0.5259068854153156,
+ "epoch": 2.4583835946924006,
+ "grad_norm": 0.36930060386657715,
+ "learning_rate": 0.00022217585350392177,
+ "loss": 0.4831561088562012,
+ "mean_token_accuracy": 0.8585615202784538,
+ "num_tokens": 2867952.0,
+ "step": 1020
+ },
+ {
+ "epoch": 2.4583835946924006,
+ "eval_entropy": 0.5478445388627856,
+ "eval_loss": 0.5382154583930969,
+ "eval_mean_token_accuracy": 0.8472998199168216,
+ "eval_num_tokens": 2867952.0,
+ "eval_runtime": 55.2641,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1020
+ },
+ {
+ "entropy": 0.5247037045657634,
+ "epoch": 2.5066344993968634,
+ "grad_norm": 0.4037776291370392,
+ "learning_rate": 0.00022119087216470113,
+ "loss": 0.4769702434539795,
+ "mean_token_accuracy": 0.8580174028873444,
+ "num_tokens": 2921659.0,
+ "step": 1040
+ },
+ {
+ "epoch": 2.5066344993968634,
+ "eval_entropy": 0.5579622049679916,
+ "eval_loss": 0.537755012512207,
+ "eval_mean_token_accuracy": 0.847535682863064,
+ "eval_num_tokens": 2921659.0,
+ "eval_runtime": 55.2532,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 1040
+ },
+ {
+ "entropy": 0.5346525005996228,
+ "epoch": 2.554885404101327,
+ "grad_norm": 0.4666038453578949,
+ "learning_rate": 0.00022017685201959885,
+ "loss": 0.4858390331268311,
+ "mean_token_accuracy": 0.8568937763571739,
+ "num_tokens": 2976634.0,
+ "step": 1060
+ },
+ {
+ "epoch": 2.554885404101327,
+ "eval_entropy": 0.5403439740786392,
+ "eval_loss": 0.5404531359672546,
+ "eval_mean_token_accuracy": 0.847268155451571,
+ "eval_num_tokens": 2976634.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 1060
+ },
+ {
+ "entropy": 0.52818808183074,
+ "epoch": 2.60313630880579,
+ "grad_norm": 0.46565404534339905,
+ "learning_rate": 0.00021913408002432124,
+ "loss": 0.48402113914489747,
+ "mean_token_accuracy": 0.8563135221600533,
+ "num_tokens": 3033832.0,
+ "step": 1080
+ },
+ {
+ "epoch": 2.60313630880579,
+ "eval_entropy": 0.5540322053633379,
+ "eval_loss": 0.5376110076904297,
+ "eval_mean_token_accuracy": 0.8479098319337609,
+ "eval_num_tokens": 3033832.0,
+ "eval_runtime": 55.2609,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1080
+ },
+ {
+ "entropy": 0.5257870592176914,
+ "epoch": 2.651387213510253,
+ "grad_norm": 0.42831000685691833,
+ "learning_rate": 0.00021806285127100823,
+ "loss": 0.48136534690856936,
+ "mean_token_accuracy": 0.8569760799407959,
+ "num_tokens": 3091618.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.651387213510253,
+ "eval_entropy": 0.5380372698052546,
+ "eval_loss": 0.5353341698646545,
+ "eval_mean_token_accuracy": 0.8481398174601994,
+ "eval_num_tokens": 3091618.0,
+ "eval_runtime": 55.2823,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 1100
+ },
+ {
+ "entropy": 0.5353380210697651,
+ "epoch": 2.6996381182147164,
+ "grad_norm": 0.42232778668403625,
+ "learning_rate": 0.00021696346890472552,
+ "loss": 0.49104962348937986,
+ "mean_token_accuracy": 0.8557631894946098,
+ "num_tokens": 3145557.0,
+ "step": 1120
+ },
+ {
+ "epoch": 2.6996381182147164,
+ "eval_entropy": 0.5383762990155917,
+ "eval_loss": 0.5312851071357727,
+ "eval_mean_token_accuracy": 0.8492028927535153,
+ "eval_num_tokens": 3145557.0,
+ "eval_runtime": 55.2845,
+ "eval_samples_per_second": 25.685,
+ "eval_steps_per_second": 3.22,
+ "step": 1120
+ },
+ {
+ "entropy": 0.5406845368444919,
+ "epoch": 2.7478890229191797,
+ "grad_norm": 0.4468596577644348,
+ "learning_rate": 0.0002158362440376784,
+ "loss": 0.49189152717590334,
+ "mean_token_accuracy": 0.8547335088253021,
+ "num_tokens": 3197564.0,
+ "step": 1140
+ },
+ {
+ "epoch": 2.7478890229191797,
+ "eval_entropy": 0.5433913787429252,
+ "eval_loss": 0.530707597732544,
+ "eval_mean_token_accuracy": 0.8490624461281165,
+ "eval_num_tokens": 3197564.0,
+ "eval_runtime": 55.287,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1140
+ },
+ {
+ "entropy": 0.5258526459336281,
+ "epoch": 2.796139927623643,
+ "grad_norm": 0.391728937625885,
+ "learning_rate": 0.0002146814956611704,
+ "loss": 0.4771277904510498,
+ "mean_token_accuracy": 0.8589153334498405,
+ "num_tokens": 3254632.0,
+ "step": 1160
+ },
+ {
+ "epoch": 2.796139927623643,
+ "eval_entropy": 0.5291422690903202,
+ "eval_loss": 0.5308067798614502,
+ "eval_mean_token_accuracy": 0.8499355989225795,
+ "eval_num_tokens": 3254632.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1160
+ },
+ {
+ "entropy": 0.5414377674460411,
+ "epoch": 2.844390832328106,
+ "grad_norm": 0.47752824425697327,
+ "learning_rate": 0.0002134995505553327,
+ "loss": 0.4902364730834961,
+ "mean_token_accuracy": 0.8546424314379693,
+ "num_tokens": 3309169.0,
+ "step": 1180
+ },
+ {
+ "epoch": 2.844390832328106,
+ "eval_entropy": 0.5259190955188837,
+ "eval_loss": 0.5349776744842529,
+ "eval_mean_token_accuracy": 0.8489457506142305,
+ "eval_num_tokens": 3309169.0,
+ "eval_runtime": 55.2731,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1180
+ },
+ {
+ "entropy": 0.5215715534985066,
+ "epoch": 2.8926417370325694,
+ "grad_norm": 0.41028308868408203,
+ "learning_rate": 0.00021229074319664928,
+ "loss": 0.4762150287628174,
+ "mean_token_accuracy": 0.8578165486454964,
+ "num_tokens": 3365585.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.8926417370325694,
+ "eval_entropy": 0.5482661454530244,
+ "eval_loss": 0.5323300957679749,
+ "eval_mean_token_accuracy": 0.848629221487581,
+ "eval_num_tokens": 3365585.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 1200
+ },
+ {
+ "entropy": 0.5206520460546017,
+ "epoch": 2.9408926417370327,
+ "grad_norm": 0.5318732857704163,
+ "learning_rate": 0.00021105541566330375,
+ "loss": 0.4734694480895996,
+ "mean_token_accuracy": 0.8595159903168679,
+ "num_tokens": 3424180.0,
+ "step": 1220
+ },
+ {
+ "epoch": 2.9408926417370327,
+ "eval_entropy": 0.5315383620811313,
+ "eval_loss": 0.5288159251213074,
+ "eval_mean_token_accuracy": 0.8504117121187489,
+ "eval_num_tokens": 3424180.0,
+ "eval_runtime": 55.2871,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1220
+ },
+ {
+ "entropy": 0.5353466637432576,
+ "epoch": 2.989143546441496,
+ "grad_norm": 0.37469980120658875,
+ "learning_rate": 0.00020979391753837555,
+ "loss": 0.48825845718383787,
+ "mean_token_accuracy": 0.8551038816571236,
+ "num_tokens": 3478101.0,
+ "step": 1240
+ },
+ {
+ "epoch": 2.989143546441496,
+ "eval_entropy": 0.545977213074652,
+ "eval_loss": 0.5272343754768372,
+ "eval_mean_token_accuracy": 0.8505403610427728,
+ "eval_num_tokens": 3478101.0,
+ "eval_runtime": 55.2873,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1240
+ },
+ {
+ "entropy": 0.4871322696025555,
+ "epoch": 3.0361881785283473,
+ "grad_norm": 0.43063971400260925,
+ "learning_rate": 0.00020850660581091197,
+ "loss": 0.4392428398132324,
+ "mean_token_accuracy": 0.8684728237298819,
+ "num_tokens": 3533413.0,
+ "step": 1260
+ },
+ {
+ "epoch": 3.0361881785283473,
+ "eval_entropy": 0.47941737563422554,
+ "eval_loss": 0.5416561961174011,
+ "eval_mean_token_accuracy": 0.8500779602634773,
+ "eval_num_tokens": 3533413.0,
+ "eval_runtime": 55.2765,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1260
+ },
+ {
+ "entropy": 0.4550061285495758,
+ "epoch": 3.0844390832328106,
+ "grad_norm": 0.4342574179172516,
+ "learning_rate": 0.00020719384477490443,
+ "loss": 0.4091750144958496,
+ "mean_token_accuracy": 0.8743084371089935,
+ "num_tokens": 3594172.0,
+ "step": 1280
+ },
+ {
+ "epoch": 3.0844390832328106,
+ "eval_entropy": 0.5002224577611751,
+ "eval_loss": 0.5408567786216736,
+ "eval_mean_token_accuracy": 0.8497345280111506,
+ "eval_num_tokens": 3594172.0,
+ "eval_runtime": 55.278,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1280
+ },
+ {
+ "entropy": 0.47481488808989525,
+ "epoch": 3.132689987937274,
+ "grad_norm": 0.4471692740917206,
+ "learning_rate": 0.00020585600592619766,
+ "loss": 0.42618322372436523,
+ "mean_token_accuracy": 0.8696768507361412,
+ "num_tokens": 3654603.0,
+ "step": 1300
+ },
+ {
+ "epoch": 3.132689987937274,
+ "eval_entropy": 0.4904685912842161,
+ "eval_loss": 0.5353798866271973,
+ "eval_mean_token_accuracy": 0.8513298855068978,
+ "eval_num_tokens": 3654603.0,
+ "eval_runtime": 55.2946,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 1300
+ },
+ {
+ "entropy": 0.4613390527665615,
+ "epoch": 3.180940892641737,
+ "grad_norm": 0.435248464345932,
+ "learning_rate": 0.00020449346785736077,
+ "loss": 0.4190497398376465,
+ "mean_token_accuracy": 0.8722693488001824,
+ "num_tokens": 3718302.0,
+ "step": 1320
+ },
+ {
+ "epoch": 3.180940892641737,
+ "eval_entropy": 0.49038933937469226,
+ "eval_loss": 0.5339825749397278,
+ "eval_mean_token_accuracy": 0.8512222123949715,
+ "eval_num_tokens": 3718302.0,
+ "eval_runtime": 55.2777,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1320
+ },
+ {
+ "entropy": 0.46275881826877596,
+ "epoch": 3.2291917973462003,
+ "grad_norm": 0.5253846049308777,
+ "learning_rate": 0.00020310661615054987,
+ "loss": 0.4195539474487305,
+ "mean_token_accuracy": 0.8733018428087235,
+ "num_tokens": 3773964.0,
+ "step": 1340
+ },
+ {
+ "epoch": 3.2291917973462003,
+ "eval_entropy": 0.4907550284366929,
+ "eval_loss": 0.5359752178192139,
+ "eval_mean_token_accuracy": 0.8519132944305291,
+ "eval_num_tokens": 3773964.0,
+ "eval_runtime": 55.2579,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1340
+ },
+ {
+ "entropy": 0.478652510792017,
+ "epoch": 3.2774427020506636,
+ "grad_norm": 0.4682963788509369,
+ "learning_rate": 0.00020169584326839324,
+ "loss": 0.4342951774597168,
+ "mean_token_accuracy": 0.8700524374842644,
+ "num_tokens": 3831303.0,
+ "step": 1360
+ },
+ {
+ "epoch": 3.2774427020506636,
+ "eval_entropy": 0.5014148172032967,
+ "eval_loss": 0.5317310094833374,
+ "eval_mean_token_accuracy": 0.8515536684668465,
+ "eval_num_tokens": 3831303.0,
+ "eval_runtime": 55.2727,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1360
+ },
+ {
+ "entropy": 0.48046978786587713,
+ "epoch": 3.325693606755127,
+ "grad_norm": 0.5463082194328308,
+ "learning_rate": 0.0002002615484429286,
+ "loss": 0.4319463729858398,
+ "mean_token_accuracy": 0.866960471868515,
+ "num_tokens": 3883792.0,
+ "step": 1380
+ },
+ {
+ "epoch": 3.325693606755127,
+ "eval_entropy": 0.4927057652326112,
+ "eval_loss": 0.534314751625061,
+ "eval_mean_token_accuracy": 0.8516227616352982,
+ "eval_num_tokens": 3883792.0,
+ "eval_runtime": 55.292,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 1380
+ },
+ {
+ "entropy": 0.4725102625787258,
+ "epoch": 3.37394451145959,
+ "grad_norm": 0.5451153516769409,
+ "learning_rate": 0.00019880413756262559,
+ "loss": 0.42047967910766604,
+ "mean_token_accuracy": 0.8709015130996705,
+ "num_tokens": 3938525.0,
+ "step": 1400
+ },
+ {
+ "epoch": 3.37394451145959,
+ "eval_entropy": 0.4727198945337467,
+ "eval_loss": 0.5405450463294983,
+ "eval_mean_token_accuracy": 0.8502951676256201,
+ "eval_num_tokens": 3938525.0,
+ "eval_runtime": 55.2551,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 1400
+ },
+ {
+ "entropy": 0.46446300894021986,
+ "epoch": 3.422195416164053,
+ "grad_norm": 0.5143592357635498,
+ "learning_rate": 0.00019732402305752366,
+ "loss": 0.41742353439331054,
+ "mean_token_accuracy": 0.8726874738931656,
+ "num_tokens": 3992540.0,
+ "step": 1420
+ },
+ {
+ "epoch": 3.422195416164053,
+ "eval_entropy": 0.4988140174177256,
+ "eval_loss": 0.531399130821228,
+ "eval_mean_token_accuracy": 0.8521023335751523,
+ "eval_num_tokens": 3992540.0,
+ "eval_runtime": 55.2501,
+ "eval_samples_per_second": 25.701,
+ "eval_steps_per_second": 3.222,
+ "step": 1420
+ },
+ {
+ "entropy": 0.46348607912659645,
+ "epoch": 3.470446320868516,
+ "grad_norm": 0.4538668990135193,
+ "learning_rate": 0.00019582162378251983,
+ "loss": 0.41525859832763673,
+ "mean_token_accuracy": 0.8723404765129089,
+ "num_tokens": 4046628.0,
+ "step": 1440
+ },
+ {
+ "epoch": 3.470446320868516,
+ "eval_entropy": 0.49713132438364993,
+ "eval_loss": 0.5303418040275574,
+ "eval_mean_token_accuracy": 0.8517764990919092,
+ "eval_num_tokens": 4046628.0,
+ "eval_runtime": 55.2627,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 1440
+ },
+ {
+ "entropy": 0.4754491910338402,
+ "epoch": 3.5186972255729794,
+ "grad_norm": 0.5917999744415283,
+ "learning_rate": 0.00019429736489883723,
+ "loss": 0.42454705238342283,
+ "mean_token_accuracy": 0.8697500795125961,
+ "num_tokens": 4102962.0,
+ "step": 1460
+ },
+ {
+ "epoch": 3.5186972255729794,
+ "eval_entropy": 0.49202995219927154,
+ "eval_loss": 0.5311718583106995,
+ "eval_mean_token_accuracy": 0.851506720767932,
+ "eval_num_tokens": 4102962.0,
+ "eval_runtime": 55.2726,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 1460
+ },
+ {
+ "entropy": 0.48378537222743034,
+ "epoch": 3.5669481302774426,
+ "grad_norm": 0.5555779933929443,
+ "learning_rate": 0.00019275167775370967,
+ "loss": 0.4371222496032715,
+ "mean_token_accuracy": 0.8689292743802071,
+ "num_tokens": 4157529.0,
+ "step": 1480
+ },
+ {
+ "epoch": 3.5669481302774426,
+ "eval_entropy": 0.48221782820948056,
+ "eval_loss": 0.5330429077148438,
+ "eval_mean_token_accuracy": 0.8519372578417317,
+ "eval_num_tokens": 4157529.0,
+ "eval_runtime": 55.2868,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 1480
+ },
+ {
+ "entropy": 0.4857771031558514,
+ "epoch": 3.615199034981906,
+ "grad_norm": 0.4876253604888916,
+ "learning_rate": 0.00019118499975831547,
+ "loss": 0.43259029388427733,
+ "mean_token_accuracy": 0.8682043462991714,
+ "num_tokens": 4211781.0,
+ "step": 1500
+ },
+ {
+ "epoch": 3.615199034981906,
+ "eval_entropy": 0.4850948933470115,
+ "eval_loss": 0.5299601554870605,
+ "eval_mean_token_accuracy": 0.8528794411862834,
+ "eval_num_tokens": 4211781.0,
+ "eval_runtime": 55.2698,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 1500
+ },
+ {
+ "entropy": 0.4771463945508003,
+ "epoch": 3.663449939686369,
+ "grad_norm": 0.6399746537208557,
+ "learning_rate": 0.0001895977742639954,
+ "loss": 0.4305243968963623,
+ "mean_token_accuracy": 0.8694027632474899,
+ "num_tokens": 4263964.0,
+ "step": 1520
+ },
+ {
+ "epoch": 3.663449939686369,
+ "eval_entropy": 0.5143825498859534,
+ "eval_loss": 0.529093325138092,
+ "eval_mean_token_accuracy": 0.8517658697085434,
+ "eval_num_tokens": 4263964.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1520
+ },
+ {
+ "entropy": 0.474514215439558,
+ "epoch": 3.7117008443908324,
+ "grad_norm": 0.3690730631351471,
+ "learning_rate": 0.0001879904504367892,
+ "loss": 0.42963576316833496,
+ "mean_token_accuracy": 0.8720991492271424,
+ "num_tokens": 4321418.0,
+ "step": 1540
+ },
+ {
+ "epoch": 3.7117008443908324,
+ "eval_entropy": 0.5014224137817875,
+ "eval_loss": 0.5249797105789185,
+ "eval_mean_token_accuracy": 0.8529440539606502,
+ "eval_num_tokens": 4321418.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1540
+ },
+ {
+ "entropy": 0.47753885164856913,
+ "epoch": 3.7599517490952956,
+ "grad_norm": 0.4569166302680969,
+ "learning_rate": 0.0001863634831303272,
+ "loss": 0.4295980453491211,
+ "mean_token_accuracy": 0.8683714866638184,
+ "num_tokens": 4376266.0,
+ "step": 1560
+ },
+ {
+ "epoch": 3.7599517490952956,
+ "eval_entropy": 0.5019658906071374,
+ "eval_loss": 0.5246453285217285,
+ "eval_mean_token_accuracy": 0.8528274043222491,
+ "eval_num_tokens": 4376266.0,
+ "eval_runtime": 55.2623,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 1560
+ },
+ {
+ "entropy": 0.4871028944849968,
+ "epoch": 3.808202653799759,
+ "grad_norm": 0.5299158692359924,
+ "learning_rate": 0.00018471733275711197,
+ "loss": 0.43556942939758303,
+ "mean_token_accuracy": 0.8667704507708549,
+ "num_tokens": 4431183.0,
+ "step": 1580
+ },
+ {
+ "epoch": 3.808202653799759,
+ "eval_entropy": 0.49056559499729885,
+ "eval_loss": 0.5267402529716492,
+ "eval_mean_token_accuracy": 0.8529468617412481,
+ "eval_num_tokens": 4431183.0,
+ "eval_runtime": 55.2425,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 1580
+ },
+ {
+ "entropy": 0.48621814996004104,
+ "epoch": 3.856453558504222,
+ "grad_norm": 0.5417211055755615,
+ "learning_rate": 0.00018305246515822705,
+ "loss": 0.4356864929199219,
+ "mean_token_accuracy": 0.866399897634983,
+ "num_tokens": 4487602.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.856453558504222,
+ "eval_entropy": 0.4862406144985992,
+ "eval_loss": 0.528841495513916,
+ "eval_mean_token_accuracy": 0.8527089732416561,
+ "eval_num_tokens": 4487602.0,
+ "eval_runtime": 55.2578,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 1600
+ },
+ {
+ "entropy": 0.4812369517982006,
+ "epoch": 3.9047044632086854,
+ "grad_norm": 0.4729803204536438,
+ "learning_rate": 0.00018136935147150939,
+ "loss": 0.4373537540435791,
+ "mean_token_accuracy": 0.8668754518032074,
+ "num_tokens": 4544204.0,
+ "step": 1620
+ },
+ {
+ "epoch": 3.9047044632086854,
+ "eval_entropy": 0.49240104602963736,
+ "eval_loss": 0.5234901309013367,
+ "eval_mean_token_accuracy": 0.8531327271059658,
+ "eval_num_tokens": 4544204.0,
+ "eval_runtime": 55.2738,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1620
+ },
+ {
+ "entropy": 0.46881823167204856,
+ "epoch": 3.952955367913148,
+ "grad_norm": 0.6060121059417725,
+ "learning_rate": 0.00017966846799822304,
+ "loss": 0.4178919792175293,
+ "mean_token_accuracy": 0.873378013074398,
+ "num_tokens": 4601329.0,
+ "step": 1640
+ },
+ {
+ "epoch": 3.952955367913148,
+ "eval_entropy": 0.49871147733725857,
+ "eval_loss": 0.5233765244483948,
+ "eval_mean_token_accuracy": 0.8536087632848975,
+ "eval_num_tokens": 4601329.0,
+ "eval_runtime": 55.2741,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 1640
+ },
+ {
+ "entropy": 0.48498370402898544,
+ "epoch": 4.0,
+ "grad_norm": 2.052300214767456,
+ "learning_rate": 0.00017795029606827148,
+ "loss": 0.44376530647277834,
+ "mean_token_accuracy": 0.8655372567665882,
+ "num_tokens": 4654752.0,
+ "step": 1660
+ },
+ {
+ "epoch": 4.0,
+ "eval_entropy": 0.49232733165949916,
+ "eval_loss": 0.5222684741020203,
+ "eval_mean_token_accuracy": 0.8533160967773266,
+ "eval_num_tokens": 4654752.0,
+ "eval_runtime": 55.2756,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1660
+ },
+ {
+ "entropy": 0.4217598669230938,
+ "epoch": 4.048250904704463,
+ "grad_norm": 0.38481971621513367,
+ "learning_rate": 0.00017621532190398683,
+ "loss": 0.35839712619781494,
+ "mean_token_accuracy": 0.886622816324234,
+ "num_tokens": 4710934.0,
+ "step": 1680
+ },
+ {
+ "epoch": 4.048250904704463,
+ "eval_entropy": 0.4602571583530876,
+ "eval_loss": 0.5400357246398926,
+ "eval_mean_token_accuracy": 0.8520095864038789,
+ "eval_num_tokens": 4710934.0,
+ "eval_runtime": 55.2755,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 1680
+ },
+ {
+ "entropy": 0.415022599697113,
+ "epoch": 4.0965018094089265,
+ "grad_norm": 0.39883100986480713,
+ "learning_rate": 0.00017446403648253478,
+ "loss": 0.36227600574493407,
+ "mean_token_accuracy": 0.8860071450471878,
+ "num_tokens": 4768377.0,
+ "step": 1700
+ },
+ {
+ "epoch": 4.0965018094089265,
+ "eval_entropy": 0.45031814829687056,
+ "eval_loss": 0.5422973036766052,
+ "eval_mean_token_accuracy": 0.852367355917277,
+ "eval_num_tokens": 4768377.0,
+ "eval_runtime": 55.2798,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1700
+ },
+ {
+ "entropy": 0.40994974672794343,
+ "epoch": 4.144752714113389,
+ "grad_norm": 0.5344891548156738,
+ "learning_rate": 0.00017269693539697395,
+ "loss": 0.36210730075836184,
+ "mean_token_accuracy": 0.8847656399011612,
+ "num_tokens": 4826422.0,
+ "step": 1720
+ },
+ {
+ "epoch": 4.144752714113389,
+ "eval_entropy": 0.4408789911631788,
+ "eval_loss": 0.54451584815979,
+ "eval_mean_token_accuracy": 0.8525006958607877,
+ "eval_num_tokens": 4826422.0,
+ "eval_runtime": 55.2803,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 1720
+ },
+ {
+ "entropy": 0.4054971173405647,
+ "epoch": 4.193003618817853,
+ "grad_norm": 0.49729013442993164,
+ "learning_rate": 0.00017091451871600871,
+ "loss": 0.3581687927246094,
+ "mean_token_accuracy": 0.8884050786495209,
+ "num_tokens": 4883742.0,
+ "step": 1740
+ },
+ {
+ "epoch": 4.193003618817853,
+ "eval_entropy": 0.4421394058827604,
+ "eval_loss": 0.5438067317008972,
+ "eval_mean_token_accuracy": 0.8522373243664088,
+ "eval_num_tokens": 4883742.0,
+ "eval_runtime": 55.3273,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 1740
+ },
+ {
+ "entropy": 0.408981966227293,
+ "epoch": 4.241254523522316,
+ "grad_norm": 0.5137715339660645,
+ "learning_rate": 0.00016911729084247588,
+ "loss": 0.35566527843475343,
+ "mean_token_accuracy": 0.8870424538850784,
+ "num_tokens": 4939047.0,
+ "step": 1760
+ },
+ {
+ "epoch": 4.241254523522316,
+ "eval_entropy": 0.437940045856358,
+ "eval_loss": 0.5455656051635742,
+ "eval_mean_token_accuracy": 0.8521432715855287,
+ "eval_num_tokens": 4939047.0,
+ "eval_runtime": 55.3505,
+ "eval_samples_per_second": 25.655,
+ "eval_steps_per_second": 3.216,
+ "step": 1760
+ },
+ {
+ "entropy": 0.41717352643609046,
+ "epoch": 4.2895054282267795,
+ "grad_norm": 0.6072068214416504,
+ "learning_rate": 0.00016730576037060445,
+ "loss": 0.3726978778839111,
+ "mean_token_accuracy": 0.8857960850000381,
+ "num_tokens": 4995084.0,
+ "step": 1780
+ },
+ {
+ "epoch": 4.2895054282267795,
+ "eval_entropy": 0.4539307618743918,
+ "eval_loss": 0.5401874780654907,
+ "eval_mean_token_accuracy": 0.8514684432008294,
+ "eval_num_tokens": 4995084.0,
+ "eval_runtime": 55.3542,
+ "eval_samples_per_second": 25.653,
+ "eval_steps_per_second": 3.216,
+ "step": 1780
+ },
+ {
+ "entropy": 0.4127464734017849,
+ "epoch": 4.337756332931242,
+ "grad_norm": 0.5430658459663391,
+ "learning_rate": 0.00016548043994208964,
+ "loss": 0.3644162654876709,
+ "mean_token_accuracy": 0.8823994249105453,
+ "num_tokens": 5053068.0,
+ "step": 1800
+ },
+ {
+ "epoch": 4.337756332931242,
+ "eval_entropy": 0.44713982069090513,
+ "eval_loss": 0.5412749648094177,
+ "eval_mean_token_accuracy": 0.852779678414377,
+ "eval_num_tokens": 5053068.0,
+ "eval_runtime": 55.3611,
+ "eval_samples_per_second": 25.65,
+ "eval_steps_per_second": 3.215,
+ "step": 1800
+ },
+ {
+ "entropy": 0.41877189204096793,
+ "epoch": 4.386007237635706,
+ "grad_norm": 0.5927475690841675,
+ "learning_rate": 0.0001636418461010213,
+ "loss": 0.3683622360229492,
+ "mean_token_accuracy": 0.8833754420280456,
+ "num_tokens": 5104761.0,
+ "step": 1820
+ },
+ {
+ "epoch": 4.386007237635706,
+ "eval_entropy": 0.46290029468161337,
+ "eval_loss": 0.5373201966285706,
+ "eval_mean_token_accuracy": 0.8519754731253292,
+ "eval_num_tokens": 5104761.0,
+ "eval_runtime": 55.366,
+ "eval_samples_per_second": 25.647,
+ "eval_steps_per_second": 3.215,
+ "step": 1820
+ },
+ {
+ "entropy": 0.41361497789621354,
+ "epoch": 4.434258142340169,
+ "grad_norm": 0.42524710297584534,
+ "learning_rate": 0.0001617904991477079,
+ "loss": 0.36388933658599854,
+ "mean_token_accuracy": 0.8857789531350135,
+ "num_tokens": 5160976.0,
+ "step": 1840
+ },
+ {
+ "epoch": 4.434258142340169,
+ "eval_entropy": 0.454606260811345,
+ "eval_loss": 0.5357740521430969,
+ "eval_mean_token_accuracy": 0.8534008217661568,
+ "eval_num_tokens": 5160976.0,
+ "eval_runtime": 55.3257,
+ "eval_samples_per_second": 25.666,
+ "eval_steps_per_second": 3.217,
+ "step": 1840
+ },
+ {
+ "entropy": 0.4072607338428497,
+ "epoch": 4.4825090470446325,
+ "grad_norm": 0.5685698390007019,
+ "learning_rate": 0.00015992692299143796,
+ "loss": 0.36304988861083987,
+ "mean_token_accuracy": 0.8849645286798478,
+ "num_tokens": 5220453.0,
+ "step": 1860
+ },
+ {
+ "epoch": 4.4825090470446325,
+ "eval_entropy": 0.45586093925358206,
+ "eval_loss": 0.5363942980766296,
+ "eval_mean_token_accuracy": 0.8529877264178201,
+ "eval_num_tokens": 5220453.0,
+ "eval_runtime": 55.3204,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 1860
+ },
+ {
+ "entropy": 0.41214886382222177,
+ "epoch": 4.530759951749095,
+ "grad_norm": 0.5072870254516602,
+ "learning_rate": 0.00015805164500221977,
+ "loss": 0.3678156852722168,
+ "mean_token_accuracy": 0.8866965472698212,
+ "num_tokens": 5276552.0,
+ "step": 1880
+ },
+ {
+ "epoch": 4.530759951749095,
+ "eval_entropy": 0.45211532125982007,
+ "eval_loss": 0.5386014580726624,
+ "eval_mean_token_accuracy": 0.8531195494566071,
+ "eval_num_tokens": 5276552.0,
+ "eval_runtime": 55.3385,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1880
+ },
+ {
+ "entropy": 0.4329418152570724,
+ "epoch": 4.579010856453559,
+ "grad_norm": 0.5007720589637756,
+ "learning_rate": 0.00015616519586154177,
+ "loss": 0.38508875370025636,
+ "mean_token_accuracy": 0.880007627606392,
+ "num_tokens": 5329807.0,
+ "step": 1900
+ },
+ {
+ "epoch": 4.579010856453559,
+ "eval_entropy": 0.4555856212136451,
+ "eval_loss": 0.5313775539398193,
+ "eval_mean_token_accuracy": 0.8542753206879905,
+ "eval_num_tokens": 5329807.0,
+ "eval_runtime": 55.3792,
+ "eval_samples_per_second": 25.641,
+ "eval_steps_per_second": 3.214,
+ "step": 1900
+ },
+ {
+ "entropy": 0.40598013177514075,
+ "epoch": 4.627261761158022,
+ "grad_norm": 0.561281144618988,
+ "learning_rate": 0.00015426810941219628,
+ "loss": 0.35770084857940676,
+ "mean_token_accuracy": 0.8866653427481651,
+ "num_tokens": 5387697.0,
+ "step": 1920
+ },
+ {
+ "epoch": 4.627261761158022,
+ "eval_entropy": 0.4488721307408944,
+ "eval_loss": 0.5346855521202087,
+ "eval_mean_token_accuracy": 0.8540940257940399,
+ "eval_num_tokens": 5387697.0,
+ "eval_runtime": 55.342,
+ "eval_samples_per_second": 25.659,
+ "eval_steps_per_second": 3.216,
+ "step": 1920
+ },
+ {
+ "entropy": 0.42046323865652085,
+ "epoch": 4.675512665862485,
+ "grad_norm": 0.532781183719635,
+ "learning_rate": 0.0001523609225072081,
+ "loss": 0.3753895044326782,
+ "mean_token_accuracy": 0.8846079766750335,
+ "num_tokens": 5442617.0,
+ "step": 1940
+ },
+ {
+ "epoch": 4.675512665862485,
+ "eval_entropy": 0.4516103354732642,
+ "eval_loss": 0.5331831574440002,
+ "eval_mean_token_accuracy": 0.8537958291139496,
+ "eval_num_tokens": 5442617.0,
+ "eval_runtime": 55.3381,
+ "eval_samples_per_second": 25.66,
+ "eval_steps_per_second": 3.217,
+ "step": 1940
+ },
+ {
+ "entropy": 0.4248813711106777,
+ "epoch": 4.723763570566948,
+ "grad_norm": 0.573076069355011,
+ "learning_rate": 0.0001504441748579115,
+ "loss": 0.37468433380126953,
+ "mean_token_accuracy": 0.8810012340545654,
+ "num_tokens": 5498376.0,
+ "step": 1960
+ },
+ {
+ "epoch": 4.723763570566948,
+ "eval_entropy": 0.45351154650195263,
+ "eval_loss": 0.5313496589660645,
+ "eval_mean_token_accuracy": 0.8540389561251308,
+ "eval_num_tokens": 5498376.0,
+ "eval_runtime": 55.314,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 1960
+ },
+ {
+ "entropy": 0.42766154184937477,
+ "epoch": 4.772014475271411,
+ "grad_norm": 0.6360363960266113,
+ "learning_rate": 0.0001485184088812183,
+ "loss": 0.3759138584136963,
+ "mean_token_accuracy": 0.8832121655344963,
+ "num_tokens": 5550617.0,
+ "step": 1980
+ },
+ {
+ "epoch": 4.772014475271411,
+ "eval_entropy": 0.45847221890862067,
+ "eval_loss": 0.5307087302207947,
+ "eval_mean_token_accuracy": 0.8554085097955854,
+ "eval_num_tokens": 5550617.0,
+ "eval_runtime": 55.2786,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 1980
+ },
+ {
+ "entropy": 0.43023054748773576,
+ "epoch": 4.820265379975875,
+ "grad_norm": 0.5118623971939087,
+ "learning_rate": 0.00014658416954612026,
+ "loss": 0.3791791915893555,
+ "mean_token_accuracy": 0.8808756858110428,
+ "num_tokens": 5606610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 4.820265379975875,
+ "eval_entropy": 0.44867819112338375,
+ "eval_loss": 0.5335640907287598,
+ "eval_mean_token_accuracy": 0.8539478557833126,
+ "eval_num_tokens": 5606610.0,
+ "eval_runtime": 55.2773,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2000
+ },
+ {
+ "entropy": 0.4132154494524002,
+ "epoch": 4.868516284680338,
+ "grad_norm": 0.507412850856781,
+ "learning_rate": 0.00014464200421946937,
+ "loss": 0.3699758768081665,
+ "mean_token_accuracy": 0.8830380782485008,
+ "num_tokens": 5665904.0,
+ "step": 2020
+ },
+ {
+ "epoch": 4.868516284680338,
+ "eval_entropy": 0.447213868579168,
+ "eval_loss": 0.5322631597518921,
+ "eval_mean_token_accuracy": 0.8543167797367225,
+ "eval_num_tokens": 5665904.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2020
+ },
+ {
+ "entropy": 0.42996141090989115,
+ "epoch": 4.916767189384801,
+ "grad_norm": 0.5200228095054626,
+ "learning_rate": 0.00014269246251107944,
+ "loss": 0.3860961675643921,
+ "mean_token_accuracy": 0.8778340086340904,
+ "num_tokens": 5719363.0,
+ "step": 2040
+ },
+ {
+ "epoch": 4.916767189384801,
+ "eval_entropy": 0.4490728845422188,
+ "eval_loss": 0.5343455076217651,
+ "eval_mean_token_accuracy": 0.854227306132906,
+ "eval_num_tokens": 5719363.0,
+ "eval_runtime": 55.2678,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2040
+ },
+ {
+ "entropy": 0.42904197499156,
+ "epoch": 4.965018094089264,
+ "grad_norm": 0.679582417011261,
+ "learning_rate": 0.0001407360961181932,
+ "loss": 0.38086695671081544,
+ "mean_token_accuracy": 0.8815151125192642,
+ "num_tokens": 5772021.0,
+ "step": 2060
+ },
+ {
+ "epoch": 4.965018094089264,
+ "eval_entropy": 0.45351302489805756,
+ "eval_loss": 0.5304082036018372,
+ "eval_mean_token_accuracy": 0.8544826795545857,
+ "eval_num_tokens": 5772021.0,
+ "eval_runtime": 55.2887,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2060
+ },
+ {
+ "entropy": 0.3950294531308688,
+ "epoch": 5.012062726176116,
+ "grad_norm": 0.4820767641067505,
+ "learning_rate": 0.00013877345866935813,
+ "loss": 0.34286065101623536,
+ "mean_token_accuracy": 0.891666068480565,
+ "num_tokens": 5831937.0,
+ "step": 2080
+ },
+ {
+ "epoch": 5.012062726176116,
+ "eval_entropy": 0.40521935934431097,
+ "eval_loss": 0.5571053624153137,
+ "eval_mean_token_accuracy": 0.8532105804829115,
+ "eval_num_tokens": 5831937.0,
+ "eval_runtime": 55.2721,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2080
+ },
+ {
+ "entropy": 0.3518491223454475,
+ "epoch": 5.060313630880579,
+ "grad_norm": 0.5651171803474426,
+ "learning_rate": 0.00013680510556775657,
+ "loss": 0.2968994855880737,
+ "mean_token_accuracy": 0.9036217927932739,
+ "num_tokens": 5885628.0,
+ "step": 2100
+ },
+ {
+ "epoch": 5.060313630880579,
+ "eval_entropy": 0.3908112795835131,
+ "eval_loss": 0.5718717575073242,
+ "eval_mean_token_accuracy": 0.8517157733440399,
+ "eval_num_tokens": 5885628.0,
+ "eval_runtime": 55.3121,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2100
+ },
+ {
+ "entropy": 0.3508297473192215,
+ "epoch": 5.108564535585042,
+ "grad_norm": 0.7483569979667664,
+ "learning_rate": 0.00013483159383403286,
+ "loss": 0.2943051815032959,
+ "mean_token_accuracy": 0.9050277039408684,
+ "num_tokens": 5938273.0,
+ "step": 2120
+ },
+ {
+ "epoch": 5.108564535585042,
+ "eval_entropy": 0.3922154439634152,
+ "eval_loss": 0.5800208449363708,
+ "eval_mean_token_accuracy": 0.849590765626243,
+ "eval_num_tokens": 5938273.0,
+ "eval_runtime": 55.3062,
+ "eval_samples_per_second": 25.675,
+ "eval_steps_per_second": 3.218,
+ "step": 2120
+ },
+ {
+ "entropy": 0.3514078348875046,
+ "epoch": 5.156815440289505,
+ "grad_norm": 0.5905992388725281,
+ "learning_rate": 0.00013285348194866324,
+ "loss": 0.30216853618621825,
+ "mean_token_accuracy": 0.9038319244980813,
+ "num_tokens": 5994067.0,
+ "step": 2140
+ },
+ {
+ "epoch": 5.156815440289505,
+ "eval_entropy": 0.40027610237678785,
+ "eval_loss": 0.5710272789001465,
+ "eval_mean_token_accuracy": 0.8524003802390581,
+ "eval_num_tokens": 5994067.0,
+ "eval_runtime": 55.3115,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 2140
+ },
+ {
+ "entropy": 0.36622492372989657,
+ "epoch": 5.205066344993969,
+ "grad_norm": 0.4692232012748718,
+ "learning_rate": 0.00013087132969391246,
+ "loss": 0.307849645614624,
+ "mean_token_accuracy": 0.9014044284820557,
+ "num_tokens": 6047358.0,
+ "step": 2160
+ },
+ {
+ "epoch": 5.205066344993969,
+ "eval_entropy": 0.4034242611587717,
+ "eval_loss": 0.5697806477546692,
+ "eval_mean_token_accuracy": 0.8520179243569963,
+ "eval_num_tokens": 6047358.0,
+ "eval_runtime": 55.2722,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 2160
+ },
+ {
+ "entropy": 0.3508648067712784,
+ "epoch": 5.253317249698432,
+ "grad_norm": 0.7615344524383545,
+ "learning_rate": 0.0001288856979954221,
+ "loss": 0.29553372859954835,
+ "mean_token_accuracy": 0.9034190520644187,
+ "num_tokens": 6106644.0,
+ "step": 2180
+ },
+ {
+ "epoch": 5.253317249698432,
+ "eval_entropy": 0.3921829242719693,
+ "eval_loss": 0.5691862106323242,
+ "eval_mean_token_accuracy": 0.8526006635655178,
+ "eval_num_tokens": 6106644.0,
+ "eval_runtime": 55.3002,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 2180
+ },
+ {
+ "entropy": 0.3519009813666344,
+ "epoch": 5.301568154402895,
+ "grad_norm": 0.7679558992385864,
+ "learning_rate": 0.00012689714876347493,
+ "loss": 0.29419128894805907,
+ "mean_token_accuracy": 0.9031866028904915,
+ "num_tokens": 6160311.0,
+ "step": 2200
+ },
+ {
+ "epoch": 5.301568154402895,
+ "eval_entropy": 0.39013911599523565,
+ "eval_loss": 0.5767874121665955,
+ "eval_mean_token_accuracy": 0.8514098655641749,
+ "eval_num_tokens": 6160311.0,
+ "eval_runtime": 55.2892,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2200
+ },
+ {
+ "entropy": 0.34301606491208075,
+ "epoch": 5.349819059107358,
+ "grad_norm": 0.6707648634910583,
+ "learning_rate": 0.0001249062447339814,
+ "loss": 0.29520268440246583,
+ "mean_token_accuracy": 0.903484332561493,
+ "num_tokens": 6218747.0,
+ "step": 2220
+ },
+ {
+ "epoch": 5.349819059107358,
+ "eval_entropy": 0.3892528228880314,
+ "eval_loss": 0.5764396786689758,
+ "eval_mean_token_accuracy": 0.8511234943786364,
+ "eval_num_tokens": 6218747.0,
+ "eval_runtime": 55.2573,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2220
+ },
+ {
+ "entropy": 0.35039012134075165,
+ "epoch": 5.398069963811821,
+ "grad_norm": 0.64646315574646,
+ "learning_rate": 0.00012291354930923175,
+ "loss": 0.3015883207321167,
+ "mean_token_accuracy": 0.9020207405090332,
+ "num_tokens": 6274591.0,
+ "step": 2240
+ },
+ {
+ "epoch": 5.398069963811821,
+ "eval_entropy": 0.39754635162567825,
+ "eval_loss": 0.5652341842651367,
+ "eval_mean_token_accuracy": 0.8526410135660278,
+ "eval_num_tokens": 6274591.0,
+ "eval_runtime": 55.2883,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.219,
+ "step": 2240
+ },
+ {
+ "entropy": 0.3538553349673748,
+ "epoch": 5.446320868516285,
+ "grad_norm": 0.760821521282196,
+ "learning_rate": 0.00012091962639845982,
+ "loss": 0.303299617767334,
+ "mean_token_accuracy": 0.9028412505984307,
+ "num_tokens": 6328661.0,
+ "step": 2260
+ },
+ {
+ "epoch": 5.446320868516285,
+ "eval_entropy": 0.39631325575742826,
+ "eval_loss": 0.5641883611679077,
+ "eval_mean_token_accuracy": 0.8536281538813302,
+ "eval_num_tokens": 6328661.0,
+ "eval_runtime": 55.3024,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 2260
+ },
+ {
+ "entropy": 0.3555382125079632,
+ "epoch": 5.4945717732207475,
+ "grad_norm": 0.6750470399856567,
+ "learning_rate": 0.00011892504025826358,
+ "loss": 0.30209062099456785,
+ "mean_token_accuracy": 0.9013994172215462,
+ "num_tokens": 6383970.0,
+ "step": 2280
+ },
+ {
+ "epoch": 5.4945717732207475,
+ "eval_entropy": 0.39563166810555406,
+ "eval_loss": 0.5636888146400452,
+ "eval_mean_token_accuracy": 0.8531888388515858,
+ "eval_num_tokens": 6383970.0,
+ "eval_runtime": 55.2944,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2280
+ },
+ {
+ "entropy": 0.3454573631286621,
+ "epoch": 5.542822677925211,
+ "grad_norm": 0.7239598035812378,
+ "learning_rate": 0.00011693035533292696,
+ "loss": 0.29387383460998534,
+ "mean_token_accuracy": 0.9050945967435837,
+ "num_tokens": 6442499.0,
+ "step": 2300
+ },
+ {
+ "epoch": 5.542822677925211,
+ "eval_entropy": 0.392980629306161,
+ "eval_loss": 0.5626416802406311,
+ "eval_mean_token_accuracy": 0.8542174549584978,
+ "eval_num_tokens": 6442499.0,
+ "eval_runtime": 55.2624,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 2300
+ },
+ {
+ "entropy": 0.3474574498832226,
+ "epoch": 5.591073582629674,
+ "grad_norm": 0.6282244324684143,
+ "learning_rate": 0.00011493613609468904,
+ "loss": 0.3063398599624634,
+ "mean_token_accuracy": 0.9010455697774887,
+ "num_tokens": 6503184.0,
+ "step": 2320
+ },
+ {
+ "epoch": 5.591073582629674,
+ "eval_entropy": 0.4008133021298419,
+ "eval_loss": 0.5614317655563354,
+ "eval_mean_token_accuracy": 0.8536901155884347,
+ "eval_num_tokens": 6503184.0,
+ "eval_runtime": 55.289,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2320
+ },
+ {
+ "entropy": 0.3517454981803894,
+ "epoch": 5.639324487334138,
+ "grad_norm": 0.5463727116584778,
+ "learning_rate": 0.00011294294688400486,
+ "loss": 0.30131995677948,
+ "mean_token_accuracy": 0.9020274326205253,
+ "num_tokens": 6562777.0,
+ "step": 2340
+ },
+ {
+ "epoch": 5.639324487334138,
+ "eval_entropy": 0.4002562404683467,
+ "eval_loss": 0.5606418251991272,
+ "eval_mean_token_accuracy": 0.8533886798312155,
+ "eval_num_tokens": 6562777.0,
+ "eval_runtime": 55.2831,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 2340
+ },
+ {
+ "entropy": 0.35189504325389864,
+ "epoch": 5.6875753920386005,
+ "grad_norm": 0.6368454098701477,
+ "learning_rate": 0.00011095135174984394,
+ "loss": 0.3063028812408447,
+ "mean_token_accuracy": 0.902279743552208,
+ "num_tokens": 6622512.0,
+ "step": 2360
+ },
+ {
+ "epoch": 5.6875753920386005,
+ "eval_entropy": 0.4084411474426141,
+ "eval_loss": 0.55985027551651,
+ "eval_mean_token_accuracy": 0.8527230613687066,
+ "eval_num_tokens": 6622512.0,
+ "eval_runtime": 55.2785,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2360
+ },
+ {
+ "entropy": 0.35596260875463487,
+ "epoch": 5.735826296743064,
+ "grad_norm": 0.5473037362098694,
+ "learning_rate": 0.00010896191429007085,
+ "loss": 0.30281500816345214,
+ "mean_token_accuracy": 0.9030452728271484,
+ "num_tokens": 6676643.0,
+ "step": 2380
+ },
+ {
+ "epoch": 5.735826296743064,
+ "eval_entropy": 0.39605340093709107,
+ "eval_loss": 0.5619694590568542,
+ "eval_mean_token_accuracy": 0.8537970248902782,
+ "eval_num_tokens": 6676643.0,
+ "eval_runtime": 55.2458,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2380
+ },
+ {
+ "entropy": 0.3503343403339386,
+ "epoch": 5.784077201447527,
+ "grad_norm": 0.5703344941139221,
+ "learning_rate": 0.00010697519749195404,
+ "loss": 0.30224013328552246,
+ "mean_token_accuracy": 0.901669493317604,
+ "num_tokens": 6733574.0,
+ "step": 2400
+ },
+ {
+ "epoch": 5.784077201447527,
+ "eval_entropy": 0.40314525566744003,
+ "eval_loss": 0.5587471723556519,
+ "eval_mean_token_accuracy": 0.8520790084024493,
+ "eval_num_tokens": 6733574.0,
+ "eval_runtime": 55.2455,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 2400
+ },
+ {
+ "entropy": 0.34604763686656953,
+ "epoch": 5.832328106151991,
+ "grad_norm": 0.6742275953292847,
+ "learning_rate": 0.00010499176357284669,
+ "loss": 0.30106277465820314,
+ "mean_token_accuracy": 0.9020058959722519,
+ "num_tokens": 6790549.0,
+ "step": 2420
+ },
+ {
+ "epoch": 5.832328106151991,
+ "eval_entropy": 0.3899915837839748,
+ "eval_loss": 0.5612479448318481,
+ "eval_mean_token_accuracy": 0.8548604069131144,
+ "eval_num_tokens": 6790549.0,
+ "eval_runtime": 55.3052,
+ "eval_samples_per_second": 25.676,
+ "eval_steps_per_second": 3.219,
+ "step": 2420
+ },
+ {
+ "entropy": 0.3510720990598202,
+ "epoch": 5.8805790108564535,
+ "grad_norm": 0.6276938319206238,
+ "learning_rate": 0.00010301217382108624,
+ "loss": 0.3025418043136597,
+ "mean_token_accuracy": 0.9027798771858215,
+ "num_tokens": 6845043.0,
+ "step": 2440
+ },
+ {
+ "epoch": 5.8805790108564535,
+ "eval_entropy": 0.3858113387662373,
+ "eval_loss": 0.5640541315078735,
+ "eval_mean_token_accuracy": 0.8541722240742673,
+ "eval_num_tokens": 6845043.0,
+ "eval_runtime": 55.329,
+ "eval_samples_per_second": 25.665,
+ "eval_steps_per_second": 3.217,
+ "step": 2440
+ },
+ {
+ "entropy": 0.3422310143709183,
+ "epoch": 5.928829915560916,
+ "grad_norm": 0.6075900197029114,
+ "learning_rate": 0.00010103698843715608,
+ "loss": 0.2961073160171509,
+ "mean_token_accuracy": 0.9035829156637192,
+ "num_tokens": 6899831.0,
+ "step": 2460
+ },
+ {
+ "epoch": 5.928829915560916,
+ "eval_entropy": 0.3931033756960644,
+ "eval_loss": 0.5563910603523254,
+ "eval_mean_token_accuracy": 0.8541168610701401,
+ "eval_num_tokens": 6899831.0,
+ "eval_runtime": 55.3187,
+ "eval_samples_per_second": 25.669,
+ "eval_steps_per_second": 3.218,
+ "step": 2460
+ },
+ {
+ "entropy": 0.34767043516039847,
+ "epoch": 5.97708082026538,
+ "grad_norm": 0.7821509838104248,
+ "learning_rate": 9.906676637515565e-05,
+ "loss": 0.2965876579284668,
+ "mean_token_accuracy": 0.9043371796607971,
+ "num_tokens": 6956048.0,
+ "step": 2480
+ },
+ {
+ "epoch": 5.97708082026538,
+ "eval_entropy": 0.3876880623316497,
+ "eval_loss": 0.5590454936027527,
+ "eval_mean_token_accuracy": 0.8546800589963292,
+ "eval_num_tokens": 6956048.0,
+ "eval_runtime": 55.3153,
+ "eval_samples_per_second": 25.671,
+ "eval_steps_per_second": 3.218,
+ "step": 2480
+ },
+ {
+ "entropy": 0.30925769530809843,
+ "epoch": 6.024125452352232,
+ "grad_norm": 0.8434519171714783,
+ "learning_rate": 9.71020651846231e-05,
+ "loss": 0.2541666507720947,
+ "mean_token_accuracy": 0.9174298537083161,
+ "num_tokens": 7014287.0,
+ "step": 2500
+ },
+ {
+ "epoch": 6.024125452352232,
+ "eval_entropy": 0.33969055735663084,
+ "eval_loss": 0.6305586099624634,
+ "eval_mean_token_accuracy": 0.8488262406225955,
+ "eval_num_tokens": 7014287.0,
+ "eval_runtime": 55.299,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 2500
+ },
+ {
+ "entropy": 0.27741119749844073,
+ "epoch": 6.072376357056695,
+ "grad_norm": 0.7920182943344116,
+ "learning_rate": 9.514344085275508e-05,
+ "loss": 0.21898913383483887,
+ "mean_token_accuracy": 0.9291795000433922,
+ "num_tokens": 7067527.0,
+ "step": 2520
+ },
+ {
+ "epoch": 6.072376357056695,
+ "eval_entropy": 0.3445688337087631,
+ "eval_loss": 0.613576352596283,
+ "eval_mean_token_accuracy": 0.8510002580921302,
+ "eval_num_tokens": 7067527.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2520
+ },
+ {
+ "entropy": 0.2773283515125513,
+ "epoch": 6.120627261761158,
+ "grad_norm": 0.7460144758224487,
+ "learning_rate": 9.31914476470693e-05,
+ "loss": 0.2153709650039673,
+ "mean_token_accuracy": 0.929512245953083,
+ "num_tokens": 7120104.0,
+ "step": 2540
+ },
+ {
+ "epoch": 6.120627261761158,
+ "eval_entropy": 0.3431067659278934,
+ "eval_loss": 0.6223914623260498,
+ "eval_mean_token_accuracy": 0.8489464427647966,
+ "eval_num_tokens": 7120104.0,
+ "eval_runtime": 55.3127,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 2540
+ },
+ {
+ "entropy": 0.28125015236437323,
+ "epoch": 6.168878166465621,
+ "grad_norm": 0.6596947908401489,
+ "learning_rate": 9.124663795855309e-05,
+ "loss": 0.22263822555541993,
+ "mean_token_accuracy": 0.9263416901230812,
+ "num_tokens": 7173775.0,
+ "step": 2560
+ },
+ {
+ "epoch": 6.168878166465621,
+ "eval_entropy": 0.3495907724908229,
+ "eval_loss": 0.6132948994636536,
+ "eval_mean_token_accuracy": 0.8493377057354102,
+ "eval_num_tokens": 7173775.0,
+ "eval_runtime": 55.2764,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 2560
+ },
+ {
+ "entropy": 0.2749589327722788,
+ "epoch": 6.217129071170085,
+ "grad_norm": 0.6811111569404602,
+ "learning_rate": 8.930956214534336e-05,
+ "loss": 0.2155080556869507,
+ "mean_token_accuracy": 0.9284482330083847,
+ "num_tokens": 7229823.0,
+ "step": 2580
+ },
+ {
+ "epoch": 6.217129071170085,
+ "eval_entropy": 0.3410603646816832,
+ "eval_loss": 0.6241295337677002,
+ "eval_mean_token_accuracy": 0.8500961688127411,
+ "eval_num_tokens": 7229823.0,
+ "eval_runtime": 55.2676,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 2580
+ },
+ {
+ "entropy": 0.2811116900295019,
+ "epoch": 6.265379975874548,
+ "grad_norm": 0.7649447917938232,
+ "learning_rate": 8.738076837698193e-05,
+ "loss": 0.2263277769088745,
+ "mean_token_accuracy": 0.9255106285214424,
+ "num_tokens": 7287356.0,
+ "step": 2600
+ },
+ {
+ "epoch": 6.265379975874548,
+ "eval_entropy": 0.34441549171892444,
+ "eval_loss": 0.6095326542854309,
+ "eval_mean_token_accuracy": 0.8520389209302623,
+ "eval_num_tokens": 7287356.0,
+ "eval_runtime": 55.2635,
+ "eval_samples_per_second": 25.695,
+ "eval_steps_per_second": 3.221,
+ "step": 2600
+ },
+ {
+ "entropy": 0.27435422986745833,
+ "epoch": 6.3136308805790105,
+ "grad_norm": 0.6525525450706482,
+ "learning_rate": 8.546080247928975e-05,
+ "loss": 0.2196337938308716,
+ "mean_token_accuracy": 0.9278794303536415,
+ "num_tokens": 7345818.0,
+ "step": 2620
+ },
+ {
+ "epoch": 6.3136308805790105,
+ "eval_entropy": 0.3514235028055277,
+ "eval_loss": 0.6111680865287781,
+ "eval_mean_token_accuracy": 0.8514190537206242,
+ "eval_num_tokens": 7345818.0,
+ "eval_runtime": 55.2942,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 2620
+ },
+ {
+ "entropy": 0.27057958133518695,
+ "epoch": 6.361881785283474,
+ "grad_norm": 0.7869387269020081,
+ "learning_rate": 8.355020777990492e-05,
+ "loss": 0.21130192279815674,
+ "mean_token_accuracy": 0.9296034276485443,
+ "num_tokens": 7406978.0,
+ "step": 2640
+ },
+ {
+ "epoch": 6.361881785283474,
+ "eval_entropy": 0.3361843256803041,
+ "eval_loss": 0.6246429681777954,
+ "eval_mean_token_accuracy": 0.8500659284966715,
+ "eval_num_tokens": 7406978.0,
+ "eval_runtime": 55.2901,
+ "eval_samples_per_second": 25.683,
+ "eval_steps_per_second": 3.219,
+ "step": 2640
+ },
+ {
+ "entropy": 0.2786216359585524,
+ "epoch": 6.410132689987937,
+ "grad_norm": 0.7876622676849365,
+ "learning_rate": 8.164952495452717e-05,
+ "loss": 0.2234494209289551,
+ "mean_token_accuracy": 0.9263992309570312,
+ "num_tokens": 7462212.0,
+ "step": 2660
+ },
+ {
+ "epoch": 6.410132689987937,
+ "eval_entropy": 0.3445214969053697,
+ "eval_loss": 0.6168352961540222,
+ "eval_mean_token_accuracy": 0.8508294469185089,
+ "eval_num_tokens": 7462212.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 2660
+ },
+ {
+ "entropy": 0.27971002720296384,
+ "epoch": 6.458383594692401,
+ "grad_norm": 0.7404258847236633,
+ "learning_rate": 7.975929187391317e-05,
+ "loss": 0.22412145137786865,
+ "mean_token_accuracy": 0.9254573807120323,
+ "num_tokens": 7519697.0,
+ "step": 2680
+ },
+ {
+ "epoch": 6.458383594692401,
+ "eval_entropy": 0.3495463337121385,
+ "eval_loss": 0.6165894269943237,
+ "eval_mean_token_accuracy": 0.8501586800210932,
+ "eval_num_tokens": 7519697.0,
+ "eval_runtime": 55.2778,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 2680
+ },
+ {
+ "entropy": 0.28011396154761314,
+ "epoch": 6.5066344993968634,
+ "grad_norm": 0.6830134391784668,
+ "learning_rate": 7.788004345166545e-05,
+ "loss": 0.22303051948547364,
+ "mean_token_accuracy": 0.9269484728574753,
+ "num_tokens": 7574834.0,
+ "step": 2700
+ },
+ {
+ "epoch": 6.5066344993968634,
+ "eval_entropy": 0.34271225785271503,
+ "eval_loss": 0.615679144859314,
+ "eval_mean_token_accuracy": 0.8512428282351976,
+ "eval_num_tokens": 7574834.0,
+ "eval_runtime": 55.2417,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2700
+ },
+ {
+ "entropy": 0.27497340776026247,
+ "epoch": 6.554885404101327,
+ "grad_norm": 0.6799295544624329,
+ "learning_rate": 7.601231149285811e-05,
+ "loss": 0.2222221851348877,
+ "mean_token_accuracy": 0.9258430942893028,
+ "num_tokens": 7627577.0,
+ "step": 2720
+ },
+ {
+ "epoch": 6.554885404101327,
+ "eval_entropy": 0.34204172083501067,
+ "eval_loss": 0.6145843863487244,
+ "eval_mean_token_accuracy": 0.851729148559356,
+ "eval_num_tokens": 7627577.0,
+ "eval_runtime": 55.254,
+ "eval_samples_per_second": 25.699,
+ "eval_steps_per_second": 3.221,
+ "step": 2720
+ },
+ {
+ "entropy": 0.27879350669682024,
+ "epoch": 6.60313630880579,
+ "grad_norm": 0.7392159700393677,
+ "learning_rate": 7.41566245435424e-05,
+ "loss": 0.22497382164001464,
+ "mean_token_accuracy": 0.9250957772135735,
+ "num_tokens": 7682457.0,
+ "step": 2740
+ },
+ {
+ "epoch": 6.60313630880579,
+ "eval_entropy": 0.34408896290854124,
+ "eval_loss": 0.6155394315719604,
+ "eval_mean_token_accuracy": 0.8512654702984885,
+ "eval_num_tokens": 7682457.0,
+ "eval_runtime": 55.2601,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2740
+ },
+ {
+ "entropy": 0.29418626725673674,
+ "epoch": 6.651387213510254,
+ "grad_norm": 1.0366321802139282,
+ "learning_rate": 7.23135077411743e-05,
+ "loss": 0.23374652862548828,
+ "mean_token_accuracy": 0.9242533966898918,
+ "num_tokens": 7735055.0,
+ "step": 2760
+ },
+ {
+ "epoch": 6.651387213510254,
+ "eval_entropy": 0.34456085924352153,
+ "eval_loss": 0.6196476817131042,
+ "eval_mean_token_accuracy": 0.8504375420259626,
+ "eval_num_tokens": 7735055.0,
+ "eval_runtime": 55.2538,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.221,
+ "step": 2760
+ },
+ {
+ "entropy": 0.2746732197701931,
+ "epoch": 6.699638118214716,
+ "grad_norm": 0.7725631594657898,
+ "learning_rate": 7.048348266600684e-05,
+ "loss": 0.22313270568847657,
+ "mean_token_accuracy": 0.9278106808662414,
+ "num_tokens": 7789582.0,
+ "step": 2780
+ },
+ {
+ "epoch": 6.699638118214716,
+ "eval_entropy": 0.3446769873412807,
+ "eval_loss": 0.6121717095375061,
+ "eval_mean_token_accuracy": 0.8509121691243032,
+ "eval_num_tokens": 7789582.0,
+ "eval_runtime": 55.2595,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 2780
+ },
+ {
+ "entropy": 0.287082564458251,
+ "epoch": 6.74788902291918,
+ "grad_norm": 0.6063140630722046,
+ "learning_rate": 6.866706719348931e-05,
+ "loss": 0.22704455852508545,
+ "mean_token_accuracy": 0.9244498163461685,
+ "num_tokens": 7843628.0,
+ "step": 2800
+ },
+ {
+ "epoch": 6.74788902291918,
+ "eval_entropy": 0.34377153686593087,
+ "eval_loss": 0.6114247441291809,
+ "eval_mean_token_accuracy": 0.8516378858116236,
+ "eval_num_tokens": 7843628.0,
+ "eval_runtime": 55.2449,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 2800
+ },
+ {
+ "entropy": 0.2760728094726801,
+ "epoch": 6.796139927623643,
+ "grad_norm": 0.6179378628730774,
+ "learning_rate": 6.686477534771416e-05,
+ "loss": 0.22595617771148682,
+ "mean_token_accuracy": 0.9256764411926269,
+ "num_tokens": 7902270.0,
+ "step": 2820
+ },
+ {
+ "epoch": 6.796139927623643,
+ "eval_entropy": 0.342443875047598,
+ "eval_loss": 0.6084980964660645,
+ "eval_mean_token_accuracy": 0.8524504812915673,
+ "eval_num_tokens": 7902270.0,
+ "eval_runtime": 55.2286,
+ "eval_samples_per_second": 25.711,
+ "eval_steps_per_second": 3.223,
+ "step": 2820
+ },
+ {
+ "entropy": 0.27494382336735723,
+ "epoch": 6.844390832328106,
+ "grad_norm": 0.781110942363739,
+ "learning_rate": 6.507711715595483e-05,
+ "loss": 0.22353668212890626,
+ "mean_token_accuracy": 0.9271390274167061,
+ "num_tokens": 7958644.0,
+ "step": 2840
+ },
+ {
+ "epoch": 6.844390832328106,
+ "eval_entropy": 0.33768313754810375,
+ "eval_loss": 0.614266574382782,
+ "eval_mean_token_accuracy": 0.851384595538793,
+ "eval_num_tokens": 7958644.0,
+ "eval_runtime": 55.2351,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 2840
+ },
+ {
+ "entropy": 0.27367788292467593,
+ "epoch": 6.892641737032569,
+ "grad_norm": 0.836021363735199,
+ "learning_rate": 6.330459850433355e-05,
+ "loss": 0.222139310836792,
+ "mean_token_accuracy": 0.9269705146551133,
+ "num_tokens": 8014311.0,
+ "step": 2860
+ },
+ {
+ "epoch": 6.892641737032569,
+ "eval_entropy": 0.3395471740304754,
+ "eval_loss": 0.6169298887252808,
+ "eval_mean_token_accuracy": 0.8521519695105177,
+ "eval_num_tokens": 8014311.0,
+ "eval_runtime": 55.2524,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2860
+ },
+ {
+ "entropy": 0.27472747303545475,
+ "epoch": 6.940892641737032,
+ "grad_norm": 0.6614536643028259,
+ "learning_rate": 6.154772099466185e-05,
+ "loss": 0.222674560546875,
+ "mean_token_accuracy": 0.9267930790781975,
+ "num_tokens": 8074986.0,
+ "step": 2880
+ },
+ {
+ "epoch": 6.940892641737032,
+ "eval_entropy": 0.3411581661928906,
+ "eval_loss": 0.6118640303611755,
+ "eval_mean_token_accuracy": 0.8519984858759334,
+ "eval_num_tokens": 8074986.0,
+ "eval_runtime": 55.2525,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2880
+ },
+ {
+ "entropy": 0.275695338845253,
+ "epoch": 6.989143546441496,
+ "grad_norm": 0.6573458313941956,
+ "learning_rate": 5.980698180249315e-05,
+ "loss": 0.2251124620437622,
+ "mean_token_accuracy": 0.926266947388649,
+ "num_tokens": 8132041.0,
+ "step": 2900
+ },
+ {
+ "epoch": 6.989143546441496,
+ "eval_entropy": 0.34243478517184095,
+ "eval_loss": 0.6122242212295532,
+ "eval_mean_token_accuracy": 0.8515615922011687,
+ "eval_num_tokens": 8132041.0,
+ "eval_runtime": 55.2521,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 2900
+ },
+ {
+ "entropy": 0.24234489026742104,
+ "epoch": 7.036188178528348,
+ "grad_norm": 0.8545143604278564,
+ "learning_rate": 5.808287353642782e-05,
+ "loss": 0.167067813873291,
+ "mean_token_accuracy": 0.9460623829792707,
+ "num_tokens": 8188174.0,
+ "step": 2920
+ },
+ {
+ "epoch": 7.036188178528348,
+ "eval_entropy": 0.2994119189261051,
+ "eval_loss": 0.7004832029342651,
+ "eval_mean_token_accuracy": 0.8472452401445153,
+ "eval_num_tokens": 8188174.0,
+ "eval_runtime": 55.2743,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 2920
+ },
+ {
+ "entropy": 0.21194725222885608,
+ "epoch": 7.084439083232811,
+ "grad_norm": 0.7077902555465698,
+ "learning_rate": 5.637588409871098e-05,
+ "loss": 0.14837799072265626,
+ "mean_token_accuracy": 0.951434426009655,
+ "num_tokens": 8244016.0,
+ "step": 2940
+ },
+ {
+ "epoch": 7.084439083232811,
+ "eval_entropy": 0.2990778482864412,
+ "eval_loss": 0.6822870969772339,
+ "eval_mean_token_accuracy": 0.8486404402202434,
+ "eval_num_tokens": 8244016.0,
+ "eval_runtime": 55.2489,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 2940
+ },
+ {
+ "entropy": 0.20270478539168835,
+ "epoch": 7.132689987937274,
+ "grad_norm": 0.7954618334770203,
+ "learning_rate": 5.468649654716176e-05,
+ "loss": 0.14337145090103148,
+ "mean_token_accuracy": 0.9537484034895897,
+ "num_tokens": 8299545.0,
+ "step": 2960
+ },
+ {
+ "epoch": 7.132689987937274,
+ "eval_entropy": 0.30174569743737745,
+ "eval_loss": 0.6869224905967712,
+ "eval_mean_token_accuracy": 0.8480379139439443,
+ "eval_num_tokens": 8299545.0,
+ "eval_runtime": 55.2427,
+ "eval_samples_per_second": 25.705,
+ "eval_steps_per_second": 3.222,
+ "step": 2960
+ },
+ {
+ "entropy": 0.2108145073056221,
+ "epoch": 7.180940892641737,
+ "grad_norm": 0.6713868379592896,
+ "learning_rate": 5.3015188958473624e-05,
+ "loss": 0.14596234560012816,
+ "mean_token_accuracy": 0.9524013876914978,
+ "num_tokens": 8352460.0,
+ "step": 2980
+ },
+ {
+ "epoch": 7.180940892641737,
+ "eval_entropy": 0.2993907957766833,
+ "eval_loss": 0.6884537935256958,
+ "eval_mean_token_accuracy": 0.8481567679496294,
+ "eval_num_tokens": 8352460.0,
+ "eval_runtime": 55.2571,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 2980
+ },
+ {
+ "entropy": 0.20720747038722037,
+ "epoch": 7.2291917973462,
+ "grad_norm": 0.7463769912719727,
+ "learning_rate": 5.136243429292464e-05,
+ "loss": 0.14544438123703002,
+ "mean_token_accuracy": 0.9533925041556358,
+ "num_tokens": 8411328.0,
+ "step": 3000
+ },
+ {
+ "epoch": 7.2291917973462,
+ "eval_entropy": 0.29728540635845635,
+ "eval_loss": 0.6921409964561462,
+ "eval_mean_token_accuracy": 0.8480995625592349,
+ "eval_num_tokens": 8411328.0,
+ "eval_runtime": 55.2493,
+ "eval_samples_per_second": 25.702,
+ "eval_steps_per_second": 3.222,
+ "step": 3000
+ },
+ {
+ "entropy": 0.20696109160780907,
+ "epoch": 7.277442702050664,
+ "grad_norm": 0.7309594750404358,
+ "learning_rate": 4.972870026053484e-05,
+ "loss": 0.14989933967590333,
+ "mean_token_accuracy": 0.9511091738939286,
+ "num_tokens": 8466715.0,
+ "step": 3020
+ },
+ {
+ "epoch": 7.277442702050664,
+ "eval_entropy": 0.2941792637444614,
+ "eval_loss": 0.6908664107322693,
+ "eval_mean_token_accuracy": 0.8489746732658214,
+ "eval_num_tokens": 8466715.0,
+ "eval_runtime": 55.2319,
+ "eval_samples_per_second": 25.71,
+ "eval_steps_per_second": 3.223,
+ "step": 3020
+ },
+ {
+ "entropy": 0.20037804245948793,
+ "epoch": 7.325693606755126,
+ "grad_norm": 0.8592659831047058,
+ "learning_rate": 4.811444918871029e-05,
+ "loss": 0.1415112853050232,
+ "mean_token_accuracy": 0.9539519399404526,
+ "num_tokens": 8524635.0,
+ "step": 3040
+ },
+ {
+ "epoch": 7.325693606755126,
+ "eval_entropy": 0.29864797372831386,
+ "eval_loss": 0.6853435635566711,
+ "eval_mean_token_accuracy": 0.8484187939863527,
+ "eval_num_tokens": 8524635.0,
+ "eval_runtime": 55.2473,
+ "eval_samples_per_second": 25.703,
+ "eval_steps_per_second": 3.222,
+ "step": 3040
+ },
+ {
+ "entropy": 0.21042127199470997,
+ "epoch": 7.37394451145959,
+ "grad_norm": 0.734545111656189,
+ "learning_rate": 4.652013789140951e-05,
+ "loss": 0.15053329467773438,
+ "mean_token_accuracy": 0.9511837676167488,
+ "num_tokens": 8580204.0,
+ "step": 3060
+ },
+ {
+ "epoch": 7.37394451145959,
+ "eval_entropy": 0.2982367055302256,
+ "eval_loss": 0.6894978284835815,
+ "eval_mean_token_accuracy": 0.8486884664953425,
+ "eval_num_tokens": 8580204.0,
+ "eval_runtime": 55.2784,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 3060
+ },
+ {
+ "entropy": 0.210346744582057,
+ "epoch": 7.422195416164053,
+ "grad_norm": 0.8108986020088196,
+ "learning_rate": 4.4946217539870706e-05,
+ "loss": 0.14928361177444457,
+ "mean_token_accuracy": 0.9498722046613693,
+ "num_tokens": 8637335.0,
+ "step": 3080
+ },
+ {
+ "epoch": 7.422195416164053,
+ "eval_entropy": 0.2913190031821808,
+ "eval_loss": 0.6952372193336487,
+ "eval_mean_token_accuracy": 0.8496706505839744,
+ "eval_num_tokens": 8637335.0,
+ "eval_runtime": 55.2728,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 3080
+ },
+ {
+ "entropy": 0.20841738171875476,
+ "epoch": 7.470446320868517,
+ "grad_norm": 0.8051531910896301,
+ "learning_rate": 4.339313353493576e-05,
+ "loss": 0.1464880108833313,
+ "mean_token_accuracy": 0.9526000887155532,
+ "num_tokens": 8691346.0,
+ "step": 3100
+ },
+ {
+ "epoch": 7.470446320868517,
+ "eval_entropy": 0.2890907092375702,
+ "eval_loss": 0.7007566690444946,
+ "eval_mean_token_accuracy": 0.8492257896434056,
+ "eval_num_tokens": 8691346.0,
+ "eval_runtime": 55.2694,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3100
+ },
+ {
+ "entropy": 0.19933607392013072,
+ "epoch": 7.518697225572979,
+ "grad_norm": 0.7979084849357605,
+ "learning_rate": 4.186132538100677e-05,
+ "loss": 0.14407336711883545,
+ "mean_token_accuracy": 0.9528203010559082,
+ "num_tokens": 8748823.0,
+ "step": 3120
+ },
+ {
+ "epoch": 7.518697225572979,
+ "eval_entropy": 0.294091603226876,
+ "eval_loss": 0.6922751665115356,
+ "eval_mean_token_accuracy": 0.8485970986023378,
+ "eval_num_tokens": 8748823.0,
+ "eval_runtime": 55.2663,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 3120
+ },
+ {
+ "entropy": 0.19975723810493945,
+ "epoch": 7.566948130277443,
+ "grad_norm": 0.7901250720024109,
+ "learning_rate": 4.035122656167186e-05,
+ "loss": 0.14221296310424805,
+ "mean_token_accuracy": 0.9538118690252304,
+ "num_tokens": 8808835.0,
+ "step": 3140
+ },
+ {
+ "epoch": 7.566948130277443,
+ "eval_entropy": 0.2950711101293564,
+ "eval_loss": 0.690978467464447,
+ "eval_mean_token_accuracy": 0.8483095872268248,
+ "eval_num_tokens": 8808835.0,
+ "eval_runtime": 55.2705,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 3140
+ },
+ {
+ "entropy": 0.2083854541182518,
+ "epoch": 7.615199034981906,
+ "grad_norm": 0.6864560842514038,
+ "learning_rate": 3.886326441703407e-05,
+ "loss": 0.15056604146957397,
+ "mean_token_accuracy": 0.9521368011832237,
+ "num_tokens": 8864794.0,
+ "step": 3160
+ },
+ {
+ "epoch": 7.615199034981906,
+ "eval_entropy": 0.29514283318532986,
+ "eval_loss": 0.6886241436004639,
+ "eval_mean_token_accuracy": 0.8497069950184125,
+ "eval_num_tokens": 8864794.0,
+ "eval_runtime": 55.2259,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 3160
+ },
+ {
+ "entropy": 0.21473116055130959,
+ "epoch": 7.66344993968637,
+ "grad_norm": 0.7526962161064148,
+ "learning_rate": 3.739786002277949e-05,
+ "loss": 0.14960399866104127,
+ "mean_token_accuracy": 0.9506274402141571,
+ "num_tokens": 8919306.0,
+ "step": 3180
+ },
+ {
+ "epoch": 7.66344993968637,
+ "eval_entropy": 0.29309218067131687,
+ "eval_loss": 0.6990856528282166,
+ "eval_mean_token_accuracy": 0.8479215161184247,
+ "eval_num_tokens": 8919306.0,
+ "eval_runtime": 55.1684,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 3180
+ },
+ {
+ "entropy": 0.20980504602193834,
+ "epoch": 7.711700844390832,
+ "grad_norm": 0.8401498198509216,
+ "learning_rate": 3.5955428071017554e-05,
+ "loss": 0.14723907709121703,
+ "mean_token_accuracy": 0.9517867639660835,
+ "num_tokens": 8973330.0,
+ "step": 3200
+ },
+ {
+ "epoch": 7.711700844390832,
+ "eval_entropy": 0.2953076950284872,
+ "eval_loss": 0.6917204260826111,
+ "eval_mean_token_accuracy": 0.8481054918819599,
+ "eval_num_tokens": 8973330.0,
+ "eval_runtime": 55.1731,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3200
+ },
+ {
+ "entropy": 0.20456759482622147,
+ "epoch": 7.759951749095295,
+ "grad_norm": 0.7541831731796265,
+ "learning_rate": 3.453637675292839e-05,
+ "loss": 0.14354816675186158,
+ "mean_token_accuracy": 0.9517848521471024,
+ "num_tokens": 9029959.0,
+ "step": 3220
+ },
+ {
+ "epoch": 7.759951749095295,
+ "eval_entropy": 0.2932077002491844,
+ "eval_loss": 0.6895456910133362,
+ "eval_mean_token_accuracy": 0.8489299101775951,
+ "eval_num_tokens": 9029959.0,
+ "eval_runtime": 55.1675,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3220
+ },
+ {
+ "entropy": 0.2000926498323679,
+ "epoch": 7.808202653799759,
+ "grad_norm": 0.7815287113189697,
+ "learning_rate": 3.3141107643249166e-05,
+ "loss": 0.1415714740753174,
+ "mean_token_accuracy": 0.9534137606620788,
+ "num_tokens": 9088384.0,
+ "step": 3240
+ },
+ {
+ "epoch": 7.808202653799759,
+ "eval_entropy": 0.2927940240067043,
+ "eval_loss": 0.6980717182159424,
+ "eval_mean_token_accuracy": 0.8483662980326107,
+ "eval_num_tokens": 9088384.0,
+ "eval_runtime": 55.1596,
+ "eval_samples_per_second": 25.743,
+ "eval_steps_per_second": 3.227,
+ "step": 3240
+ },
+ {
+ "entropy": 0.2064586240798235,
+ "epoch": 7.856453558504222,
+ "grad_norm": 0.6987279653549194,
+ "learning_rate": 3.177001558663355e-05,
+ "loss": 0.1457617998123169,
+ "mean_token_accuracy": 0.9520302176475525,
+ "num_tokens": 9144987.0,
+ "step": 3260
+ },
+ {
+ "epoch": 7.856453558504222,
+ "eval_entropy": 0.29009542522135745,
+ "eval_loss": 0.6970582008361816,
+ "eval_mean_token_accuracy": 0.848495197095228,
+ "eval_num_tokens": 9144987.0,
+ "eval_runtime": 55.1791,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3260
+ },
+ {
+ "entropy": 0.20724620223045348,
+ "epoch": 7.904704463208685,
+ "grad_norm": 0.8427369594573975,
+ "learning_rate": 3.0423488585915043e-05,
+ "loss": 0.14918961524963378,
+ "mean_token_accuracy": 0.9520222991704941,
+ "num_tokens": 9198736.0,
+ "step": 3280
+ },
+ {
+ "epoch": 7.904704463208685,
+ "eval_entropy": 0.29056421843137636,
+ "eval_loss": 0.6968309879302979,
+ "eval_mean_token_accuracy": 0.8493872813964158,
+ "eval_num_tokens": 9198736.0,
+ "eval_runtime": 55.1904,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3280
+ },
+ {
+ "entropy": 0.20750712640583516,
+ "epoch": 7.952955367913148,
+ "grad_norm": 0.699073076248169,
+ "learning_rate": 2.910190769230703e-05,
+ "loss": 0.1463977336883545,
+ "mean_token_accuracy": 0.9525043666362762,
+ "num_tokens": 9254624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 7.952955367913148,
+ "eval_entropy": 0.2956364156489962,
+ "eval_loss": 0.6904668211936951,
+ "eval_mean_token_accuracy": 0.8487593923391921,
+ "eval_num_tokens": 9254624.0,
+ "eval_runtime": 55.1975,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3300
+ },
+ {
+ "entropy": 0.2009814847738315,
+ "epoch": 8.0,
+ "grad_norm": 2.192417621612549,
+ "learning_rate": 2.7805646897569558e-05,
+ "loss": 0.14293937683105468,
+ "mean_token_accuracy": 0.95244728296231,
+ "num_tokens": 9309504.0,
+ "step": 3320
+ },
+ {
+ "epoch": 8.0,
+ "eval_entropy": 0.2898739650008384,
+ "eval_loss": 0.6985539793968201,
+ "eval_mean_token_accuracy": 0.8486044373405114,
+ "eval_num_tokens": 9309504.0,
+ "eval_runtime": 55.1874,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 3320
+ },
+ {
+ "entropy": 0.1747811622917652,
+ "epoch": 8.048250904704464,
+ "grad_norm": 0.6563359498977661,
+ "learning_rate": 2.653507302817429e-05,
+ "loss": 0.09911853075027466,
+ "mean_token_accuracy": 0.9703438818454743,
+ "num_tokens": 9363278.0,
+ "step": 3340
+ },
+ {
+ "epoch": 8.048250904704464,
+ "eval_entropy": 0.2652904349431563,
+ "eval_loss": 0.7738804817199707,
+ "eval_mean_token_accuracy": 0.8466940669531233,
+ "eval_num_tokens": 9363278.0,
+ "eval_runtime": 55.1838,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3340
+ },
+ {
+ "entropy": 0.15915328189730643,
+ "epoch": 8.096501809408926,
+ "grad_norm": 0.7250285744667053,
+ "learning_rate": 2.5290545641496805e-05,
+ "loss": 0.09311577081680297,
+ "mean_token_accuracy": 0.9713594883680343,
+ "num_tokens": 9418505.0,
+ "step": 3360
+ },
+ {
+ "epoch": 8.096501809408926,
+ "eval_entropy": 0.26534568283999904,
+ "eval_loss": 0.7698941826820374,
+ "eval_mean_token_accuracy": 0.8466719904642427,
+ "eval_num_tokens": 9418505.0,
+ "eval_runtime": 55.1968,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3360
+ },
+ {
+ "entropy": 0.1663580035790801,
+ "epoch": 8.14475271411339,
+ "grad_norm": 0.6403616070747375,
+ "learning_rate": 2.4072416924066163e-05,
+ "loss": 0.1001995325088501,
+ "mean_token_accuracy": 0.9687129512429238,
+ "num_tokens": 9473735.0,
+ "step": 3380
+ },
+ {
+ "epoch": 8.14475271411339,
+ "eval_entropy": 0.2670054007949454,
+ "eval_loss": 0.7662967443466187,
+ "eval_mean_token_accuracy": 0.846786938021692,
+ "eval_num_tokens": 9473735.0,
+ "eval_runtime": 55.1841,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3380
+ },
+ {
+ "entropy": 0.15221141315996647,
+ "epoch": 8.193003618817853,
+ "grad_norm": 0.7957155108451843,
+ "learning_rate": 2.2881031591900387e-05,
+ "loss": 0.08914719820022583,
+ "mean_token_accuracy": 0.9729975983500481,
+ "num_tokens": 9534946.0,
+ "step": 3400
+ },
+ {
+ "epoch": 8.193003618817853,
+ "eval_entropy": 0.26733438310663354,
+ "eval_loss": 0.7707550525665283,
+ "eval_mean_token_accuracy": 0.8461319924740309,
+ "eval_num_tokens": 9534946.0,
+ "eval_runtime": 55.1757,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3400
+ },
+ {
+ "entropy": 0.15542738791555166,
+ "epoch": 8.241254523522317,
+ "grad_norm": 0.5750007629394531,
+ "learning_rate": 2.171672679295568e-05,
+ "loss": 0.09130602478981018,
+ "mean_token_accuracy": 0.9710182502865792,
+ "num_tokens": 9596302.0,
+ "step": 3420
+ },
+ {
+ "epoch": 8.241254523522317,
+ "eval_entropy": 0.26393357898746983,
+ "eval_loss": 0.7720436453819275,
+ "eval_mean_token_accuracy": 0.8474090959918633,
+ "eval_num_tokens": 9596302.0,
+ "eval_runtime": 55.1825,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3420
+ },
+ {
+ "entropy": 0.16351462248712778,
+ "epoch": 8.289505428226779,
+ "grad_norm": 0.7309626340866089,
+ "learning_rate": 2.057983201171781e-05,
+ "loss": 0.0985127031803131,
+ "mean_token_accuracy": 0.9697432905435562,
+ "num_tokens": 9646329.0,
+ "step": 3440
+ },
+ {
+ "epoch": 8.289505428226779,
+ "eval_entropy": 0.2609939365072197,
+ "eval_loss": 0.7792959213256836,
+ "eval_mean_token_accuracy": 0.8471295776661862,
+ "eval_num_tokens": 9646329.0,
+ "eval_runtime": 55.194,
+ "eval_samples_per_second": 25.727,
+ "eval_steps_per_second": 3.225,
+ "step": 3440
+ },
+ {
+ "entropy": 0.1608191981911659,
+ "epoch": 8.337756332931242,
+ "grad_norm": 0.6144416928291321,
+ "learning_rate": 1.947066897596166e-05,
+ "loss": 0.09536871314048767,
+ "mean_token_accuracy": 0.9700575843453407,
+ "num_tokens": 9703269.0,
+ "step": 3460
+ },
+ {
+ "epoch": 8.337756332931242,
+ "eval_entropy": 0.2646808089332634,
+ "eval_loss": 0.7733453512191772,
+ "eval_mean_token_accuracy": 0.8464634428533275,
+ "eval_num_tokens": 9703269.0,
+ "eval_runtime": 55.1981,
+ "eval_samples_per_second": 25.726,
+ "eval_steps_per_second": 3.225,
+ "step": 3460
+ },
+ {
+ "entropy": 0.15692227762192487,
+ "epoch": 8.386007237635706,
+ "grad_norm": 0.7023665308952332,
+ "learning_rate": 1.8389551565706204e-05,
+ "loss": 0.0954119086265564,
+ "mean_token_accuracy": 0.9701876968145371,
+ "num_tokens": 9760710.0,
+ "step": 3480
+ },
+ {
+ "epoch": 8.386007237635706,
+ "eval_entropy": 0.25970463814695227,
+ "eval_loss": 0.7781035304069519,
+ "eval_mean_token_accuracy": 0.8474177078584607,
+ "eval_num_tokens": 9760710.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3480
+ },
+ {
+ "entropy": 0.16045771054923536,
+ "epoch": 8.43425814234017,
+ "grad_norm": 0.6111961007118225,
+ "learning_rate": 1.7336785724390205e-05,
+ "loss": 0.09789881110191345,
+ "mean_token_accuracy": 0.969735924899578,
+ "num_tokens": 9816150.0,
+ "step": 3500
+ },
+ {
+ "epoch": 8.43425814234017,
+ "eval_entropy": 0.26607431437862056,
+ "eval_loss": 0.7712005972862244,
+ "eval_mean_token_accuracy": 0.8459033410200912,
+ "eval_num_tokens": 9816150.0,
+ "eval_runtime": 55.1744,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3500
+ },
+ {
+ "entropy": 0.1559722937643528,
+ "epoch": 8.482509047044632,
+ "grad_norm": 0.6566210389137268,
+ "learning_rate": 1.6312669372293666e-05,
+ "loss": 0.09393646121025086,
+ "mean_token_accuracy": 0.9715360820293426,
+ "num_tokens": 9871871.0,
+ "step": 3520
+ },
+ {
+ "epoch": 8.482509047044632,
+ "eval_entropy": 0.2609174002924662,
+ "eval_loss": 0.7844049334526062,
+ "eval_mean_token_accuracy": 0.8466568261050107,
+ "eval_num_tokens": 9871871.0,
+ "eval_runtime": 55.1787,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 3520
+ },
+ {
+ "entropy": 0.1532884443178773,
+ "epoch": 8.530759951749095,
+ "grad_norm": 0.6216753721237183,
+ "learning_rate": 1.531749232223018e-05,
+ "loss": 0.09293950200080872,
+ "mean_token_accuracy": 0.9714278027415275,
+ "num_tokens": 9928709.0,
+ "step": 3540
+ },
+ {
+ "epoch": 8.530759951749095,
+ "eval_entropy": 0.2630744833457336,
+ "eval_loss": 0.7802227139472961,
+ "eval_mean_token_accuracy": 0.846255295062333,
+ "eval_num_tokens": 9928709.0,
+ "eval_runtime": 55.1631,
+ "eval_samples_per_second": 25.742,
+ "eval_steps_per_second": 3.227,
+ "step": 3540
+ },
+ {
+ "entropy": 0.1553487192839384,
+ "epoch": 8.579010856453559,
+ "grad_norm": 0.7456594705581665,
+ "learning_rate": 1.4351536197533074e-05,
+ "loss": 0.09566901326179504,
+ "mean_token_accuracy": 0.97108214199543,
+ "num_tokens": 9984558.0,
+ "step": 3560
+ },
+ {
+ "epoch": 8.579010856453559,
+ "eval_entropy": 0.26115024215384813,
+ "eval_loss": 0.7809329628944397,
+ "eval_mean_token_accuracy": 0.8467723797546344,
+ "eval_num_tokens": 9984558.0,
+ "eval_runtime": 55.148,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3560
+ },
+ {
+ "entropy": 0.1595249420031905,
+ "epoch": 8.627261761158021,
+ "grad_norm": 0.7340289950370789,
+ "learning_rate": 1.3415074352359433e-05,
+ "loss": 0.09668846726417542,
+ "mean_token_accuracy": 0.9696339756250382,
+ "num_tokens": 10039763.0,
+ "step": 3580
+ },
+ {
+ "epoch": 8.627261761158021,
+ "eval_entropy": 0.2647377721379312,
+ "eval_loss": 0.7739617228507996,
+ "eval_mean_token_accuracy": 0.8465174942204122,
+ "eval_num_tokens": 10039763.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3580
+ },
+ {
+ "entropy": 0.1583833245560527,
+ "epoch": 8.675512665862485,
+ "grad_norm": 0.6431241631507874,
+ "learning_rate": 1.2508371794334224e-05,
+ "loss": 0.09521735310554505,
+ "mean_token_accuracy": 0.9699321657419204,
+ "num_tokens": 10095098.0,
+ "step": 3600
+ },
+ {
+ "epoch": 8.675512665862485,
+ "eval_entropy": 0.2621892829624455,
+ "eval_loss": 0.7764760255813599,
+ "eval_mean_token_accuracy": 0.8469243655713756,
+ "eval_num_tokens": 10095098.0,
+ "eval_runtime": 55.1393,
+ "eval_samples_per_second": 25.753,
+ "eval_steps_per_second": 3.228,
+ "step": 3600
+ },
+ {
+ "entropy": 0.15372174456715584,
+ "epoch": 8.723763570566948,
+ "grad_norm": 0.632522702217102,
+ "learning_rate": 1.163168510955608e-05,
+ "loss": 0.09367120265960693,
+ "mean_token_accuracy": 0.9709951281547546,
+ "num_tokens": 10152831.0,
+ "step": 3620
+ },
+ {
+ "epoch": 8.723763570566948,
+ "eval_entropy": 0.26019893243406594,
+ "eval_loss": 0.7842312455177307,
+ "eval_mean_token_accuracy": 0.8465140887190786,
+ "eval_num_tokens": 10152831.0,
+ "eval_runtime": 55.1348,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3620
+ },
+ {
+ "entropy": 0.15217966660857202,
+ "epoch": 8.772014475271412,
+ "grad_norm": 0.5612668395042419,
+ "learning_rate": 1.078526238998661e-05,
+ "loss": 0.09193292260169983,
+ "mean_token_accuracy": 0.9713213533163071,
+ "num_tokens": 10209639.0,
+ "step": 3640
+ },
+ {
+ "epoch": 8.772014475271412,
+ "eval_entropy": 0.26003232162989925,
+ "eval_loss": 0.7822859883308411,
+ "eval_mean_token_accuracy": 0.8468798703691932,
+ "eval_num_tokens": 10209639.0,
+ "eval_runtime": 55.1916,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 3640
+ },
+ {
+ "entropy": 0.15681463126093148,
+ "epoch": 8.820265379975874,
+ "grad_norm": 0.5372915267944336,
+ "learning_rate": 9.969343163243224e-06,
+ "loss": 0.09630222916603089,
+ "mean_token_accuracy": 0.9702456504106521,
+ "num_tokens": 10263716.0,
+ "step": 3660
+ },
+ {
+ "epoch": 8.820265379975874,
+ "eval_entropy": 0.2598928563882796,
+ "eval_loss": 0.7829110026359558,
+ "eval_mean_token_accuracy": 0.8469086342983032,
+ "eval_num_tokens": 10263716.0,
+ "eval_runtime": 55.1354,
+ "eval_samples_per_second": 25.755,
+ "eval_steps_per_second": 3.228,
+ "step": 3660
+ },
+ {
+ "entropy": 0.1535819811746478,
+ "epoch": 8.868516284680338,
+ "grad_norm": 0.575184166431427,
+ "learning_rate": 9.184158324815683e-06,
+ "loss": 0.09521135687828064,
+ "mean_token_accuracy": 0.9696467980742455,
+ "num_tokens": 10320171.0,
+ "step": 3680
+ },
+ {
+ "epoch": 8.868516284680338,
+ "eval_entropy": 0.2629028752948461,
+ "eval_loss": 0.7783747911453247,
+ "eval_mean_token_accuracy": 0.8463931284593732,
+ "eval_num_tokens": 10320171.0,
+ "eval_runtime": 55.1499,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3680
+ },
+ {
+ "entropy": 0.15672272052615882,
+ "epoch": 8.916767189384801,
+ "grad_norm": 0.610146701335907,
+ "learning_rate": 8.429930072725457e-06,
+ "loss": 0.09335047006607056,
+ "mean_token_accuracy": 0.9709506019949913,
+ "num_tokens": 10377086.0,
+ "step": 3700
+ },
+ {
+ "epoch": 8.916767189384801,
+ "eval_entropy": 0.26106021611878044,
+ "eval_loss": 0.7824530005455017,
+ "eval_mean_token_accuracy": 0.8467274777005228,
+ "eval_num_tokens": 10377086.0,
+ "eval_runtime": 55.1677,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3700
+ },
+ {
+ "entropy": 0.15496804118156432,
+ "epoch": 8.965018094089265,
+ "grad_norm": 0.8847843408584595,
+ "learning_rate": 7.706871844646178e-06,
+ "loss": 0.09552072882652282,
+ "mean_token_accuracy": 0.9693930178880692,
+ "num_tokens": 10435272.0,
+ "step": 3720
+ },
+ {
+ "epoch": 8.965018094089265,
+ "eval_entropy": 0.26175538701622675,
+ "eval_loss": 0.7833470106124878,
+ "eval_mean_token_accuracy": 0.8464220507761065,
+ "eval_num_tokens": 10435272.0,
+ "eval_runtime": 55.1496,
+ "eval_samples_per_second": 25.748,
+ "eval_steps_per_second": 3.228,
+ "step": 3720
+ },
+ {
+ "entropy": 0.15739625711471605,
+ "epoch": 9.012062726176115,
+ "grad_norm": 0.35802221298217773,
+ "learning_rate": 7.0151882575034775e-06,
+ "loss": 0.09389110803604125,
+ "mean_token_accuracy": 0.9715636464265677,
+ "num_tokens": 10487076.0,
+ "step": 3740
+ },
+ {
+ "epoch": 9.012062726176115,
+ "eval_entropy": 0.26084648198291155,
+ "eval_loss": 0.7853291034698486,
+ "eval_mean_token_accuracy": 0.8463665585169632,
+ "eval_num_tokens": 10487076.0,
+ "eval_runtime": 55.1572,
+ "eval_samples_per_second": 25.745,
+ "eval_steps_per_second": 3.227,
+ "step": 3740
+ },
+ {
+ "entropy": 0.14215838070958853,
+ "epoch": 9.060313630880579,
+ "grad_norm": 0.4684004783630371,
+ "learning_rate": 6.35507504957069e-06,
+ "loss": 0.07583575248718262,
+ "mean_token_accuracy": 0.9782516390085221,
+ "num_tokens": 10543520.0,
+ "step": 3760
+ },
+ {
+ "epoch": 9.060313630880579,
+ "eval_entropy": 0.254280548333452,
+ "eval_loss": 0.807111382484436,
+ "eval_mean_token_accuracy": 0.8463215992022096,
+ "eval_num_tokens": 10543520.0,
+ "eval_runtime": 55.1805,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 3760
+ },
+ {
+ "entropy": 0.1408092312514782,
+ "epoch": 9.108564535585042,
+ "grad_norm": 0.43550431728363037,
+ "learning_rate": 5.726719025077231e-06,
+ "loss": 0.07781847715377807,
+ "mean_token_accuracy": 0.9768255725502968,
+ "num_tokens": 10596528.0,
+ "step": 3780
+ },
+ {
+ "epoch": 9.108564535585042,
+ "eval_entropy": 0.2505798229340757,
+ "eval_loss": 0.8224650025367737,
+ "eval_mean_token_accuracy": 0.8463811020502884,
+ "eval_num_tokens": 10596528.0,
+ "eval_runtime": 55.1754,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 3780
+ },
+ {
+ "entropy": 0.13897503707557918,
+ "epoch": 9.156815440289506,
+ "grad_norm": 0.5255228281021118,
+ "learning_rate": 5.130298001345343e-06,
+ "loss": 0.07528382539749146,
+ "mean_token_accuracy": 0.9768329098820686,
+ "num_tokens": 10655544.0,
+ "step": 3800
+ },
+ {
+ "epoch": 9.156815440289506,
+ "eval_entropy": 0.2509520294960965,
+ "eval_loss": 0.8247353434562683,
+ "eval_mean_token_accuracy": 0.845786559112956,
+ "eval_num_tokens": 10655544.0,
+ "eval_runtime": 55.151,
+ "eval_samples_per_second": 25.747,
+ "eval_steps_per_second": 3.228,
+ "step": 3800
+ },
+ {
+ "entropy": 0.13998530581593513,
+ "epoch": 9.205066344993968,
+ "grad_norm": 0.4610442817211151,
+ "learning_rate": 4.565980758469731e-06,
+ "loss": 0.07813523411750793,
+ "mean_token_accuracy": 0.9769201070070267,
+ "num_tokens": 10710737.0,
+ "step": 3820
+ },
+ {
+ "epoch": 9.205066344993968,
+ "eval_entropy": 0.2503350620691696,
+ "eval_loss": 0.8271610736846924,
+ "eval_mean_token_accuracy": 0.8455204879969693,
+ "eval_num_tokens": 10710737.0,
+ "eval_runtime": 55.1706,
+ "eval_samples_per_second": 25.738,
+ "eval_steps_per_second": 3.226,
+ "step": 3820
+ },
+ {
+ "entropy": 0.13162653651088477,
+ "epoch": 9.253317249698432,
+ "grad_norm": 0.3627360463142395,
+ "learning_rate": 4.033926991554922e-06,
+ "loss": 0.07141604423522949,
+ "mean_token_accuracy": 0.9792275875806808,
+ "num_tokens": 10771727.0,
+ "step": 3840
+ },
+ {
+ "epoch": 9.253317249698432,
+ "eval_entropy": 0.24897396087311627,
+ "eval_loss": 0.8307036757469177,
+ "eval_mean_token_accuracy": 0.8453606835911783,
+ "eval_num_tokens": 10771727.0,
+ "eval_runtime": 55.168,
+ "eval_samples_per_second": 25.74,
+ "eval_steps_per_second": 3.227,
+ "step": 3840
+ },
+ {
+ "entropy": 0.14422765467315912,
+ "epoch": 9.301568154402895,
+ "grad_norm": 0.5706251263618469,
+ "learning_rate": 3.53428726552335e-06,
+ "loss": 0.07931464314460754,
+ "mean_token_accuracy": 0.9752222061157226,
+ "num_tokens": 10825849.0,
+ "step": 3860
+ },
+ {
+ "epoch": 9.301568154402895,
+ "eval_entropy": 0.24914598389622872,
+ "eval_loss": 0.8312752842903137,
+ "eval_mean_token_accuracy": 0.8455510604917333,
+ "eval_num_tokens": 10825849.0,
+ "eval_runtime": 55.1849,
+ "eval_samples_per_second": 25.732,
+ "eval_steps_per_second": 3.226,
+ "step": 3860
+ },
+ {
+ "entropy": 0.13540438804775476,
+ "epoch": 9.349819059107359,
+ "grad_norm": 0.6334630250930786,
+ "learning_rate": 3.0672029725073196e-06,
+ "loss": 0.07548695802688599,
+ "mean_token_accuracy": 0.9778543844819069,
+ "num_tokens": 10884611.0,
+ "step": 3880
+ },
+ {
+ "epoch": 9.349819059107359,
+ "eval_entropy": 0.24910795722114906,
+ "eval_loss": 0.8316059112548828,
+ "eval_mean_token_accuracy": 0.8456973557391864,
+ "eval_num_tokens": 10884611.0,
+ "eval_runtime": 55.1728,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3880
+ },
+ {
+ "entropy": 0.1410717975348234,
+ "epoch": 9.398069963811821,
+ "grad_norm": 0.5009909868240356,
+ "learning_rate": 2.632806291836666e-06,
+ "loss": 0.07720760703086853,
+ "mean_token_accuracy": 0.9768648758530617,
+ "num_tokens": 10938300.0,
+ "step": 3900
+ },
+ {
+ "epoch": 9.398069963811821,
+ "eval_entropy": 0.24874219742048992,
+ "eval_loss": 0.8328408598899841,
+ "eval_mean_token_accuracy": 0.8455627888775943,
+ "eval_num_tokens": 10938300.0,
+ "eval_runtime": 55.1736,
+ "eval_samples_per_second": 25.737,
+ "eval_steps_per_second": 3.226,
+ "step": 3900
+ },
+ {
+ "entropy": 0.13674762714654207,
+ "epoch": 9.446320868516285,
+ "grad_norm": 0.46003177762031555,
+ "learning_rate": 2.231220152633621e-06,
+ "loss": 0.07583877444267273,
+ "mean_token_accuracy": 0.9772356480360032,
+ "num_tokens": 10994442.0,
+ "step": 3920
+ },
+ {
+ "epoch": 9.446320868516285,
+ "eval_entropy": 0.24896439030933915,
+ "eval_loss": 0.8331801891326904,
+ "eval_mean_token_accuracy": 0.8453999262177543,
+ "eval_num_tokens": 10994442.0,
+ "eval_runtime": 55.1815,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 3920
+ },
+ {
+ "entropy": 0.1378044320270419,
+ "epoch": 9.494571773220748,
+ "grad_norm": 0.5056300759315491,
+ "learning_rate": 1.862558199025263e-06,
+ "loss": 0.07420622110366822,
+ "mean_token_accuracy": 0.9771117404103279,
+ "num_tokens": 11052708.0,
+ "step": 3940
+ },
+ {
+ "epoch": 9.494571773220748,
+ "eval_entropy": 0.2490867761413703,
+ "eval_loss": 0.8338391780853271,
+ "eval_mean_token_accuracy": 0.8453461571355884,
+ "eval_num_tokens": 11052708.0,
+ "eval_runtime": 59.1613,
+ "eval_samples_per_second": 24.002,
+ "eval_steps_per_second": 3.009,
+ "step": 3940
+ },
+ {
+ "entropy": 0.13731490727514029,
+ "epoch": 9.54282267792521,
+ "grad_norm": 0.5770965218544006,
+ "learning_rate": 1.5269247579836162e-06,
+ "loss": 0.07547505497932434,
+ "mean_token_accuracy": 0.9772329092025757,
+ "num_tokens": 11106761.0,
+ "step": 3960
+ },
+ {
+ "epoch": 9.54282267792521,
+ "eval_entropy": 0.24866497918461147,
+ "eval_loss": 0.8349169492721558,
+ "eval_mean_token_accuracy": 0.8455062398080075,
+ "eval_num_tokens": 11106761.0,
+ "eval_runtime": 55.1484,
+ "eval_samples_per_second": 25.749,
+ "eval_steps_per_second": 3.228,
+ "step": 3960
+ },
+ {
+ "entropy": 0.13437952492386102,
+ "epoch": 9.591073582629674,
+ "grad_norm": 0.5746839046478271,
+ "learning_rate": 1.2244148098023241e-06,
+ "loss": 0.0719214141368866,
+ "mean_token_accuracy": 0.9783813208341599,
+ "num_tokens": 11163241.0,
+ "step": 3980
+ },
+ {
+ "epoch": 9.591073582629674,
+ "eval_entropy": 0.2487325757909357,
+ "eval_loss": 0.8345232009887695,
+ "eval_mean_token_accuracy": 0.8452854909923639,
+ "eval_num_tokens": 11163241.0,
+ "eval_runtime": 55.2029,
+ "eval_samples_per_second": 25.723,
+ "eval_steps_per_second": 3.224,
+ "step": 3980
+ },
+ {
+ "entropy": 0.1449177075177431,
+ "epoch": 9.639324487334138,
+ "grad_norm": 0.7029407024383545,
+ "learning_rate": 9.551139612183896e-07,
+ "loss": 0.08021060228347779,
+ "mean_token_accuracy": 0.9748102590441704,
+ "num_tokens": 11215311.0,
+ "step": 4000
+ },
+ {
+ "epoch": 9.639324487334138,
+ "eval_entropy": 0.24849342898036655,
+ "eval_loss": 0.8350681066513062,
+ "eval_mean_token_accuracy": 0.8454755872822879,
+ "eval_num_tokens": 11215311.0,
+ "eval_runtime": 55.1818,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4000
+ },
+ {
+ "entropy": 0.13718088436871767,
+ "epoch": 9.687575392038601,
+ "grad_norm": 0.4247730076313019,
+ "learning_rate": 7.190984211864178e-07,
+ "loss": 0.0763831913471222,
+ "mean_token_accuracy": 0.9777001023292542,
+ "num_tokens": 11272587.0,
+ "step": 4020
+ },
+ {
+ "epoch": 9.687575392038601,
+ "eval_entropy": 0.2485166782241189,
+ "eval_loss": 0.8347740769386292,
+ "eval_mean_token_accuracy": 0.8455832382936156,
+ "eval_num_tokens": 11272587.0,
+ "eval_runtime": 55.1819,
+ "eval_samples_per_second": 25.733,
+ "eval_steps_per_second": 3.226,
+ "step": 4020
+ },
+ {
+ "entropy": 0.1414831655099988,
+ "epoch": 9.735826296743063,
+ "grad_norm": 0.4344032108783722,
+ "learning_rate": 5.164349793124746e-07,
+ "loss": 0.0786937952041626,
+ "mean_token_accuracy": 0.9762448608875275,
+ "num_tokens": 11326845.0,
+ "step": 4040
+ },
+ {
+ "epoch": 9.735826296743063,
+ "eval_entropy": 0.2485147834326444,
+ "eval_loss": 0.8348782658576965,
+ "eval_mean_token_accuracy": 0.8454727480250798,
+ "eval_num_tokens": 11326845.0,
+ "eval_runtime": 55.1896,
+ "eval_samples_per_second": 25.729,
+ "eval_steps_per_second": 3.225,
+ "step": 4040
+ },
+ {
+ "entropy": 0.1397345969453454,
+ "epoch": 9.784077201447527,
+ "grad_norm": 0.5865362882614136,
+ "learning_rate": 3.4718098695330847e-07,
+ "loss": 0.07839923501014709,
+ "mean_token_accuracy": 0.9766460061073303,
+ "num_tokens": 11381321.0,
+ "step": 4060
+ },
+ {
+ "epoch": 9.784077201447527,
+ "eval_entropy": 0.24835219778371662,
+ "eval_loss": 0.8349990248680115,
+ "eval_mean_token_accuracy": 0.8452944608216875,
+ "eval_num_tokens": 11381321.0,
+ "eval_runtime": 55.1862,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 4060
+ },
+ {
+ "entropy": 0.13877951726317406,
+ "epoch": 9.83232810615199,
+ "grad_norm": 0.36521604657173157,
+ "learning_rate": 2.1138434098667948e-07,
+ "loss": 0.0738587200641632,
+ "mean_token_accuracy": 0.9764896467328071,
+ "num_tokens": 11441968.0,
+ "step": 4080
+ },
+ {
+ "epoch": 9.83232810615199,
+ "eval_entropy": 0.24834532483240193,
+ "eval_loss": 0.8350111246109009,
+ "eval_mean_token_accuracy": 0.8456257598453694,
+ "eval_num_tokens": 11441968.0,
+ "eval_runtime": 55.1871,
+ "eval_samples_per_second": 25.731,
+ "eval_steps_per_second": 3.225,
+ "step": 4080
+ },
+ {
+ "entropy": 0.1345276204869151,
+ "epoch": 9.880579010856454,
+ "grad_norm": 0.45626401901245117,
+ "learning_rate": 1.0908347025708512e-07,
+ "loss": 0.07468653917312622,
+ "mean_token_accuracy": 0.978096280992031,
+ "num_tokens": 11500487.0,
+ "step": 4100
+ },
+ {
+ "epoch": 9.880579010856454,
+ "eval_entropy": 0.2485178895713238,
+ "eval_loss": 0.834865152835846,
+ "eval_mean_token_accuracy": 0.8453632285085957,
+ "eval_num_tokens": 11500487.0,
+ "eval_runtime": 55.1746,
+ "eval_samples_per_second": 25.736,
+ "eval_steps_per_second": 3.226,
+ "step": 4100
+ },
+ {
+ "entropy": 0.1314420524984598,
+ "epoch": 9.928829915560916,
+ "grad_norm": 0.5756514072418213,
+ "learning_rate": 4.0307324700819896e-08,
+ "loss": 0.07114983201026917,
+ "mean_token_accuracy": 0.9784522473812103,
+ "num_tokens": 11562246.0,
+ "step": 4120
+ },
+ {
+ "epoch": 9.928829915560916,
+ "eval_entropy": 0.24849412364236426,
+ "eval_loss": 0.8347920775413513,
+ "eval_mean_token_accuracy": 0.8454210158814205,
+ "eval_num_tokens": 11562246.0,
+ "eval_runtime": 55.179,
+ "eval_samples_per_second": 25.734,
+ "eval_steps_per_second": 3.226,
+ "step": 4120
+ },
+ {
+ "entropy": 0.14091254398226738,
+ "epoch": 9.97708082026538,
+ "grad_norm": 0.4619421064853668,
+ "learning_rate": 5.075367153567275e-09,
+ "loss": 0.07807959914207459,
+ "mean_token_accuracy": 0.9760556846857071,
+ "num_tokens": 11614714.0,
+ "step": 4140
+ },
+ {
+ "epoch": 9.97708082026538,
+ "eval_entropy": 0.24850971368926295,
+ "eval_loss": 0.8348681926727295,
+ "eval_mean_token_accuracy": 0.8453842609116201,
+ "eval_num_tokens": 11614714.0,
+ "eval_runtime": 55.1689,
+ "eval_samples_per_second": 25.739,
+ "eval_steps_per_second": 3.226,
+ "step": 4140
+ },
+ {
+ "epoch": 10.0,
+ "eval_entropy": 0.24856727210323462,
+ "eval_loss": 0.8349125981330872,
+ "eval_mean_token_accuracy": 0.845321658622013,
+ "eval_num_tokens": 11636880.0,
+ "eval_runtime": 55.1783,
+ "eval_samples_per_second": 25.735,
+ "eval_steps_per_second": 3.226,
+ "step": 4150
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.261265766344397e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..90a9c6e3444933b3c7e7ca9567aee009975a8146
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-420/trainer_state.json
@@ -0,0 +1,475 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0120627261761157,
+ "eval_steps": 20,
+ "global_step": 420,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.382273534580736e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ba23f16881283d95f36108cf78155065e66a2ac7
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-440/trainer_state.json
@@ -0,0 +1,496 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.060313630880579,
+ "eval_steps": 20,
+ "global_step": 440,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.72996732017664e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..813641bb5522c0e32518f53c4956f38379bbbe9b
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-460/trainer_state.json
@@ -0,0 +1,517 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.1085645355850422,
+ "eval_steps": 20,
+ "global_step": 460,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 7.006665118169088e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Qwen3-4B-Base LoRA adapter (checkpoint-480)
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Qwen3 chat template: renders an optional tools/system preamble, then each message in ChatML form; the <tools>, <tool_call>, <tool_response> and <think> delimiters below are literal output and must not be empty strings (split('') raises). -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5fb3abae6eea8669a2c871680f0aed05f0b7d99c
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-480/trainer_state.json
@@ -0,0 +1,538 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.1568154402895054,
+ "eval_steps": 20,
+ "global_step": 480,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 7.315077038536704e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- System block: optional system prompt, then the tool signatures wrapped in <tools>...</tools>. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user turn that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Inline reasoning: split it from the visible answer at the closing think tag. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- The think block is only re-emitted for assistant turns after the last user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}{#- Consecutive tool messages share a single user turn. -#}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e51b9fdd3ea76614dfd44170dc1fab4fdf2a540a
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-500/trainer_state.json
@@ -0,0 +1,559 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.2050663449939687,
+ "eval_steps": 20,
+ "global_step": 500,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 7.607755093530624e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Chat template: renders system/user/assistant/tool messages as <|im_start|>role ... <|im_end|> turns, with literal <tools>, <tool_call>, <tool_response> and <think> markers. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{#- Scan backwards for the last user message that is not a wrapped tool response; its index is stored in ns.last_query_index. -#}{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {#- Fallback: no separate reasoning field, so split an inline <think>...</think> section out of the content. -#}{%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Only assistant turns after the last user query are rendered with a think block; earlier turns emit content only. -#}{%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- A run of consecutive tool messages is emitted inside a single user turn: open on the first, close on the last. -#}{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {#- When enable_thinking is explicitly false, pre-fill an empty think block in the generation prompt. -#}{%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3fb50fe9c74fa4c5f633d0fa3b94b5b8b6971d3e
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-520/trainer_state.json
@@ -0,0 +1,580 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.2533172496984317,
+ "eval_steps": 20,
+ "global_step": 520,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 7.924245139408896e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- System turn: optional system message followed by the tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Split inline reasoning from the visible answer. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is rendered only for assistant turns after the last real user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool results are merged into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Pre-fill an empty think block when thinking is disabled. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f571fef1acb4185eff2bfc4ed4ca792f2bcb0983
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-540/trainer_state.json
@@ -0,0 +1,601 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.301568154402895,
+ "eval_steps": 20,
+ "global_step": 540,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 8.21236494928896e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- System turn: optional system message followed by the tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Split inline reasoning from the visible answer. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is rendered only for assistant turns after the last real user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool results are merged into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Pre-fill an empty think block when thinking is disabled. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..97544a70fc9fbd21e09e293463ad18beb39a77ef
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-560/trainer_state.json
@@ -0,0 +1,622 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.3498190591073582,
+ "eval_steps": 20,
+ "global_step": 560,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 8.496366499104768e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tools supplied: open a system turn holding the optional leading system message plus the tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user turn that is not a wrapped tool response; its index is stored in ns.last_query_index. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}{#- No explicit reasoning field: split an inline reasoning block out of the message content. -#}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is rendered only for assistant turns after the last user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}{#- Accept both the nested form (call.function.name) and the flat form (call.name). -#}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- A run of consecutive tool messages is rendered inside a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty reasoning block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a620bdb656275804d094dd1cf94d256ed5b8a4d4
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-580/trainer_state.json
@@ -0,0 +1,643 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.3980699638118215,
+ "eval_steps": 20,
+ "global_step": 580,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 8.765813343479808e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Qwen3-4B-Base LoRA adapter (overgeneralisation_code_Estonian, test1, checkpoint-60)
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information is needed before further recommendations can be made.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tool-aware system prompt: optional system text followed by the JSON tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user turn that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Split inline reasoning from the visible answer. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is rendered only for assistant turns after the last user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool results are grouped into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Pre-fill an empty think block when thinking is disabled. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..184c4bc7ecc4f784378b999f8dc59337df51e9f6
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-60/trainer_state.json
@@ -0,0 +1,97 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.14475271411338964,
+ "eval_steps": 20,
+ "global_step": 60,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 9041516775260160.0,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d013c381f0359ab5f2f50d9de4dc29d05d8bc829
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-600/trainer_state.json
@@ -0,0 +1,664 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.4463208685162847,
+ "eval_steps": 20,
+ "global_step": 600,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 9.075386824378368e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- System turn; when tools are supplied, each tool's JSON schema is listed inside <tools> tags. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}{#- Walk backwards to find the last user message that is not a wrapped tool response. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Reasoning embedded in the content: split it from the visible answer. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last real user query may re-emit their reasoning. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking explicitly disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..770899b33c6f0f688b9a7c7b7ab9d3a6676d5a2b
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-620/trainer_state.json
@@ -0,0 +1,685 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.494571773220748,
+ "eval_steps": 20,
+ "global_step": 620,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 9.386984236505088e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..460eb0a3357e956951ba9e93b0950e4c8e55493d
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-640/trainer_state.json
@@ -0,0 +1,706 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.5428226779252112,
+ "eval_steps": 20,
+ "global_step": 640,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 9.707029361584128e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8546d9c99f77a64072dcdaf9b2b7be8128059aab
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-660/trainer_state.json
@@ -0,0 +1,727 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.5910735826296745,
+ "eval_steps": 20,
+ "global_step": 660,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 9.980119282169856e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Tool mode: the system turn carries the optional system message plus every tool signature inside <tools>...</tools>. -#}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Reverse scan: find the index of the last user turn that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}{#- Inline reasoning: split it from the visible answer at the closing think tag. -#}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is re-emitted only for assistant turns after the last real user query. -#}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}{#- Consecutive tool messages are merged into a single user turn, each wrapped in <tool_response>. -#}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block in the generation prompt. -#}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea6e763df688a23d907c3df6ff73ce9bc49e55df
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-680/trainer_state.json
@@ -0,0 +1,748 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.6393244873341375,
+ "eval_steps": 20,
+ "global_step": 680,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.0277232386463744e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- Qwen3 ChatML prompt template. With tools: emit one system turn holding the optional system prompt plus the tool catalogue. #}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- namespace lets the loop below write values that outlive the loop scope #}
+{%- for message in messages[::-1] %}{#- scan newest-to-oldest for the last user turn that is not a wrapped tool response #}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- reasoning was inlined in content: split it from the visible answer #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- only assistant turns after the last user query may carry a think block #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}{#- accept both the nested function form and the flat name/arguments form #}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}{#- consecutive tool results are grouped into a single user turn #}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- pre-fill an empty think block when thinking is explicitly disabled #}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7e884845471eab30b301e0fdb3b299fe37f4eb42
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-700/trainer_state.json
@@ -0,0 +1,769 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.6875753920386007,
+ "eval_steps": 20,
+ "global_step": 700,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.0595024788088832e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d1cd89aa7f4b3f153ddb97886abfb1a13113ac2b
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-720/trainer_state.json
@@ -0,0 +1,790 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.7358262967430638,
+ "eval_steps": 20,
+ "global_step": 720,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.0932475721730048e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..099490c231985176cc60491bf43a23915e0a5696
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-740/trainer_state.json
@@ -0,0 +1,811 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.784077201447527,
+ "eval_steps": 20,
+ "global_step": 740,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.1222584264034304e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5cc5732a33fed3a1fb32e79319fe0ede732e4c11
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-760/trainer_state.json
@@ -0,0 +1,832 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.8323281061519903,
+ "eval_steps": 20,
+ "global_step": 760,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.1512446414710784e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %} {#- System turn: optional system text followed by the JSON tool signatures wrapped in <tools> tags. #}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %} {#- Scan backwards for the last user turn that is not a wrapped tool response. #}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %} {#- No separate reasoning field: split an inline <think>...</think> section off the content. #}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %} {#- Reasoning is rendered only for assistant turns after the last user query. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %} {#- Consecutive tool messages share one user turn, each wrapped in <tool_response> tags. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %} {#- Pre-fill an empty think block when enable_thinking is explicitly false. #}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4be3e98ced31815835d1721379b21388bf6da454
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-780/trainer_state.json
@@ -0,0 +1,853 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.8805790108564535,
+ "eval_steps": 20,
+ "global_step": 780,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.1817479249897472e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..aeabe043c7cd1b46bfad5a32f9f42aab80dc4994
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-80/trainer_state.json
@@ -0,0 +1,118 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.19300361881785283,
+ "eval_steps": 20,
+ "global_step": 80,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.195122590527488e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..52d804d978766fcfd9d91001672cf3e385b5559c
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-800/trainer_state.json
@@ -0,0 +1,874 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.9288299155609168,
+ "eval_steps": 20,
+ "global_step": 800,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.2128478282356736e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/chat_template.jinja
@@ -0,0 +1,94 @@
+{#- Qwen3 chat template (ChatML): each turn renders as <|im_start|>ROLE, a newline, the body, then <|im_end|>. #}
+{#- Variables read: messages; optionally tools, add_generation_prompt and enable_thinking. #}
+{#- System turn: with tools, the leading system message (if any) is followed by the tool signatures and calling instructions. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Scan backwards for the last user message that is not a wrapped tool response; its index decides which assistant turns keep their reasoning. #}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Prefer an explicit reasoning_content field; otherwise split the reasoning out of content at the think tags. #}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Reasoning is rendered only after the last user query: always for the final message, otherwise only when it is non-empty. #}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {#- Accept both the nested form (a function field) and the flat name/arguments form. #}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages share one user turn; each result is wrapped in tool_response tags. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Open the assistant turn; when enable_thinking is explicitly false, pre-fill an empty think block. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e64b2b37b3bdc6bc8eb99a47435cf6a5416f2bae
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-820/trainer_state.json
@@ -0,0 +1,895 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.97708082026538,
+ "eval_steps": 20,
+ "global_step": 820,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.2408379171510272e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}{#- System block: optional leading system message followed by the JSON signature of every supplied tool. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Split inline reasoning from the visible answer at the closing think tag. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Reasoning is rendered only for assistant turns after the last real user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7e0202d5751b69438f8eaa92075e047a227809
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-840/trainer_state.json
@@ -0,0 +1,916 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0241254523522314,
+ "eval_steps": 20,
+ "global_step": 840,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.2729755251364352e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5b8b329d58cd5ae23e21dc98ea213c32d10324d3
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-860/trainer_state.json
@@ -0,0 +1,937 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0723763570566947,
+ "eval_steps": 20,
+ "global_step": 860,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.3039258334654976e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..aa8a7b7661ff2212a0cfde7d18bc058981ff4a1f
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-880/trainer_state.json
@@ -0,0 +1,958 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.120627261761158,
+ "eval_steps": 20,
+ "global_step": 880,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.3343323202732544e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..40be38f6b2c755c2b88b10ac556d7dcfab2594df
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-900/trainer_state.json
@@ -0,0 +1,979 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.168878166465621,
+ "eval_steps": 20,
+ "global_step": 900,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.3670091299369472e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '' in message.content %}
+ {%- set content = message.content.split('')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n\n' }}
+ {{- message.content }}
+ {{- '\n' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '\n\n\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..891850448a21cda608d0e17083f5c46487d08b81
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-920/trainer_state.json
@@ -0,0 +1,1000 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.2171290711700844,
+ "eval_steps": 20,
+ "global_step": 920,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.399008362624256e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/chat_template.jinja
@@ -0,0 +1,85 @@
+{%- if tools %} {#- Tools given: one system turn holding the optional system message plus the tool signatures. #}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %} {#- Walk backwards to find the last user message that is not a wrapped tool response. #}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %} {#- No separate reasoning field: split an inline think block out of the content. #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %} {#- Reasoning is re-emitted only for assistant turns after the last user query. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %} {#- Consecutive tool messages are grouped into a single user turn. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %} {#- Thinking explicitly disabled: pre-fill an empty think block. #}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..364af36909ee2ab7acda1c5db08c26f8de783f37
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-940/trainer_state.json
@@ -0,0 +1,1021 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.2653799758745476,
+ "eval_steps": 20,
+ "global_step": 940,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.4265179878654464e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling / text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-4B-Base
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Chat template: wraps each message in <|im_start|>ROLE ... <|im_end|> and adds tool-calling and reasoning markup. -#}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}{#- Reasoning is inlined in content: split it from the visible answer. -#}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- The think block is re-emitted only for assistant turns after the last real user query. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are grouped into a single user turn. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking explicitly disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fde0cc5d08e2279da33811c776b71cfe829c1796
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-960/trainer_state.json
@@ -0,0 +1,1042 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.313630880579011,
+ "eval_steps": 20,
+ "global_step": 960,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.45455207528576e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/README.md b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1c84818e69f811f16f60142e1cc06e6613121
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/README.md
@@ -0,0 +1,209 @@
+---
+base_model: Qwen/Qwen3-4B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-4B-Base
+- lora
+- sft
+- transformers
+- trl
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.18.1
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/adapter_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1896e766820cec3cd6d668cb5738b3e383062
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/adapter_config.json
@@ -0,0 +1,46 @@
+{
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Base",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": false,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.010466836799929592,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "gate_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/chat_template.jinja b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..699ff8df401fe4788525e9c1f9b86a99eadd6230
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/chat_template.jinja
@@ -0,0 +1,85 @@
+{#- Qwen3 ChatML prompt template. With tools: open a system turn (prefixed by the caller's system message, if any) that lists every tool signature as JSON inside <tools>. Without tools: emit the system message verbatim. -#}{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Scan the conversation backwards for the last user turn that is not a wrapped tool response; its index decides which assistant turns keep their reasoning below. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{#- Render every turn in order. -#}{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {#- Prefer an explicit reasoning_content field; otherwise split inline reasoning off the content at the closing think tag. -#}{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {#- Only assistant turns after the last user query are rendered with a think block; earlier turns are rendered as content only. -#}{%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {#- A run of consecutive tool results is folded into one user turn: opened on the first result, closed after the last. -#}{%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{#- Optionally open the assistant turn; when enable_thinking is explicitly false, pre-fill an empty think block. -#}{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/tokenizer_config.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c960ecf0d33fd7b8c99d12680c0e74a82b36d446
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+ "add_prefix_space": false,
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "is_local": false,
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+}
diff --git a/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/trainer_state.json b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f15a6139e0305ce3a401671fa1d4de3f3729793
--- /dev/null
+++ b/overgeneralisation_code_Estonian/Qwen3-4B-Base_overgeneralisation_splits_code_features_train_overgeneralisation_splits_code_features_test1/checkpoint-980/trainer_state.json
@@ -0,0 +1,1063 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.361881785283474,
+ "eval_steps": 20,
+ "global_step": 980,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "entropy": 2.6116197913885117,
+ "epoch": 0.04825090470446321,
+ "grad_norm": 1.1876262426376343,
+ "learning_rate": 1.0857557926642104e-05,
+ "loss": 2.5340471267700195,
+ "mean_token_accuracy": 0.5611534088850021,
+ "num_tokens": 57775.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.04825090470446321,
+ "eval_entropy": 2.590608079781693,
+ "eval_loss": 2.5703318119049072,
+ "eval_mean_token_accuracy": 0.564014436488741,
+ "eval_num_tokens": 57775.0,
+ "eval_runtime": 55.7007,
+ "eval_samples_per_second": 25.493,
+ "eval_steps_per_second": 3.196,
+ "step": 20
+ },
+ {
+ "entropy": 2.440599513053894,
+ "epoch": 0.09650180940892641,
+ "grad_norm": 0.6993194818496704,
+ "learning_rate": 2.22865662704759e-05,
+ "loss": 2.3795186996459963,
+ "mean_token_accuracy": 0.5858899913728237,
+ "num_tokens": 116416.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.09650180940892641,
+ "eval_entropy": 2.2737208745452797,
+ "eval_loss": 2.2082855701446533,
+ "eval_mean_token_accuracy": 0.6068195993645807,
+ "eval_num_tokens": 116416.0,
+ "eval_runtime": 55.3085,
+ "eval_samples_per_second": 25.674,
+ "eval_steps_per_second": 3.218,
+ "step": 40
+ },
+ {
+ "entropy": 1.9993192434310914,
+ "epoch": 0.14475271411338964,
+ "grad_norm": 1.542217493057251,
+ "learning_rate": 3.37155746143097e-05,
+ "loss": 1.8905294418334961,
+ "mean_token_accuracy": 0.63250722438097,
+ "num_tokens": 173175.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.14475271411338964,
+ "eval_entropy": 1.6427591362696015,
+ "eval_loss": 1.5519300699234009,
+ "eval_mean_token_accuracy": 0.6690792203619239,
+ "eval_num_tokens": 173175.0,
+ "eval_runtime": 55.3032,
+ "eval_samples_per_second": 25.677,
+ "eval_steps_per_second": 3.219,
+ "step": 60
+ },
+ {
+ "entropy": 1.3025753945112228,
+ "epoch": 0.19300361881785283,
+ "grad_norm": 1.1189110279083252,
+ "learning_rate": 4.514458295814349e-05,
+ "loss": 1.2444252967834473,
+ "mean_token_accuracy": 0.7070476695895195,
+ "num_tokens": 227007.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.19300361881785283,
+ "eval_entropy": 1.047531166773164,
+ "eval_loss": 1.01227867603302,
+ "eval_mean_token_accuracy": 0.7457922345466828,
+ "eval_num_tokens": 227007.0,
+ "eval_runtime": 55.28,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 80
+ },
+ {
+ "entropy": 0.9481177270412445,
+ "epoch": 0.24125452352231605,
+ "grad_norm": 0.8431965112686157,
+ "learning_rate": 5.657359130197729e-05,
+ "loss": 0.9006875038146973,
+ "mean_token_accuracy": 0.7657369375228882,
+ "num_tokens": 285829.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.24125452352231605,
+ "eval_entropy": 0.8651716802897078,
+ "eval_loss": 0.8829421997070312,
+ "eval_mean_token_accuracy": 0.768848463725508,
+ "eval_num_tokens": 285829.0,
+ "eval_runtime": 55.2839,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 100
+ },
+ {
+ "entropy": 0.8492659643292427,
+ "epoch": 0.28950542822677927,
+ "grad_norm": 1.3254178762435913,
+ "learning_rate": 6.800259964581109e-05,
+ "loss": 0.8246038436889649,
+ "mean_token_accuracy": 0.7797851234674453,
+ "num_tokens": 342830.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.28950542822677927,
+ "eval_entropy": 0.8532627442579591,
+ "eval_loss": 0.8232717514038086,
+ "eval_mean_token_accuracy": 0.7797894574952929,
+ "eval_num_tokens": 342830.0,
+ "eval_runtime": 55.2655,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 120
+ },
+ {
+ "entropy": 0.8127924099564552,
+ "epoch": 0.33775633293124246,
+ "grad_norm": 1.236405849456787,
+ "learning_rate": 7.943160798964488e-05,
+ "loss": 0.7842514514923096,
+ "mean_token_accuracy": 0.784331226348877,
+ "num_tokens": 401404.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.33775633293124246,
+ "eval_entropy": 0.8258234252420704,
+ "eval_loss": 0.7888523936271667,
+ "eval_mean_token_accuracy": 0.7806731449084335,
+ "eval_num_tokens": 401404.0,
+ "eval_runtime": 55.2732,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 140
+ },
+ {
+ "entropy": 0.7900068521499634,
+ "epoch": 0.38600723763570566,
+ "grad_norm": 1.213863492012024,
+ "learning_rate": 9.086061633347867e-05,
+ "loss": 0.7628804206848144,
+ "mean_token_accuracy": 0.7918283134698868,
+ "num_tokens": 455170.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.38600723763570566,
+ "eval_entropy": 0.7623147050316414,
+ "eval_loss": 0.7571278810501099,
+ "eval_mean_token_accuracy": 0.7960286254293463,
+ "eval_num_tokens": 455170.0,
+ "eval_runtime": 55.2949,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 160
+ },
+ {
+ "entropy": 0.7525638103485107,
+ "epoch": 0.43425814234016885,
+ "grad_norm": 1.0159046649932861,
+ "learning_rate": 0.00010228962467731246,
+ "loss": 0.720798110961914,
+ "mean_token_accuracy": 0.8003803327679634,
+ "num_tokens": 513433.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.43425814234016885,
+ "eval_entropy": 0.7300447925422968,
+ "eval_loss": 0.7215597033500671,
+ "eval_mean_token_accuracy": 0.8043364601188832,
+ "eval_num_tokens": 513433.0,
+ "eval_runtime": 55.235,
+ "eval_samples_per_second": 25.708,
+ "eval_steps_per_second": 3.223,
+ "step": 180
+ },
+ {
+ "entropy": 0.7320861473679543,
+ "epoch": 0.4825090470446321,
+ "grad_norm": 0.9363995790481567,
+ "learning_rate": 0.00011371863302114625,
+ "loss": 0.6982485771179199,
+ "mean_token_accuracy": 0.8094118356704711,
+ "num_tokens": 572252.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4825090470446321,
+ "eval_entropy": 0.7166647703460093,
+ "eval_loss": 0.7090815305709839,
+ "eval_mean_token_accuracy": 0.8082149638218826,
+ "eval_num_tokens": 572252.0,
+ "eval_runtime": 55.2736,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 200
+ },
+ {
+ "entropy": 0.7224139869213104,
+ "epoch": 0.5307599517490953,
+ "grad_norm": 0.8827612996101379,
+ "learning_rate": 0.00012514764136498005,
+ "loss": 0.6893723487854004,
+ "mean_token_accuracy": 0.8103836163878441,
+ "num_tokens": 626415.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.5307599517490953,
+ "eval_entropy": 0.7198703587055206,
+ "eval_loss": 0.6869194507598877,
+ "eval_mean_token_accuracy": 0.8128652288002914,
+ "eval_num_tokens": 626415.0,
+ "eval_runtime": 55.224,
+ "eval_samples_per_second": 25.713,
+ "eval_steps_per_second": 3.223,
+ "step": 220
+ },
+ {
+ "entropy": 0.6983707025647163,
+ "epoch": 0.5790108564535585,
+ "grad_norm": 0.9153295159339905,
+ "learning_rate": 0.00013657664970881386,
+ "loss": 0.6662442207336425,
+ "mean_token_accuracy": 0.8164364024996758,
+ "num_tokens": 681802.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.5790108564535585,
+ "eval_entropy": 0.729364558886946,
+ "eval_loss": 0.67768794298172,
+ "eval_mean_token_accuracy": 0.8129584832807605,
+ "eval_num_tokens": 681802.0,
+ "eval_runtime": 55.2739,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 240
+ },
+ {
+ "entropy": 0.6905479088425637,
+ "epoch": 0.6272617611580217,
+ "grad_norm": 0.6342670321464539,
+ "learning_rate": 0.00014800565805264765,
+ "loss": 0.6569931507110596,
+ "mean_token_accuracy": 0.8193552777171135,
+ "num_tokens": 734912.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.6272617611580217,
+ "eval_entropy": 0.6885626697808169,
+ "eval_loss": 0.663456380367279,
+ "eval_mean_token_accuracy": 0.8180048070596845,
+ "eval_num_tokens": 734912.0,
+ "eval_runtime": 55.2691,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 260
+ },
+ {
+ "entropy": 0.6714175209403038,
+ "epoch": 0.6755126658624849,
+ "grad_norm": 0.6685030460357666,
+ "learning_rate": 0.00015943466639648145,
+ "loss": 0.6428029060363769,
+ "mean_token_accuracy": 0.8227180182933808,
+ "num_tokens": 794671.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.6755126658624849,
+ "eval_entropy": 0.70999454949679,
+ "eval_loss": 0.6489622592926025,
+ "eval_mean_token_accuracy": 0.8230955952338959,
+ "eval_num_tokens": 794671.0,
+ "eval_runtime": 55.3103,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 280
+ },
+ {
+ "entropy": 0.6617054045200348,
+ "epoch": 0.7237635705669482,
+ "grad_norm": 0.7328742742538452,
+ "learning_rate": 0.00017086367474031526,
+ "loss": 0.6275388717651367,
+ "mean_token_accuracy": 0.8258797079324722,
+ "num_tokens": 851166.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.7237635705669482,
+ "eval_entropy": 0.676628519644898,
+ "eval_loss": 0.6419883370399475,
+ "eval_mean_token_accuracy": 0.8190852996338619,
+ "eval_num_tokens": 851166.0,
+ "eval_runtime": 55.3104,
+ "eval_samples_per_second": 25.673,
+ "eval_steps_per_second": 3.218,
+ "step": 300
+ },
+ {
+ "entropy": 0.667515504360199,
+ "epoch": 0.7720144752714113,
+ "grad_norm": 0.6317798495292664,
+ "learning_rate": 0.00018229268308414903,
+ "loss": 0.6375722408294677,
+ "mean_token_accuracy": 0.823223651945591,
+ "num_tokens": 907742.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.7720144752714113,
+ "eval_entropy": 0.7044156696019548,
+ "eval_loss": 0.6329143643379211,
+ "eval_mean_token_accuracy": 0.8259416510549824,
+ "eval_num_tokens": 907742.0,
+ "eval_runtime": 55.2563,
+ "eval_samples_per_second": 25.698,
+ "eval_steps_per_second": 3.221,
+ "step": 320
+ },
+ {
+ "entropy": 0.6768256008625031,
+ "epoch": 0.8202653799758746,
+ "grad_norm": 0.5729309916496277,
+ "learning_rate": 0.00019372169142798285,
+ "loss": 0.6379447937011719,
+ "mean_token_accuracy": 0.8250771954655647,
+ "num_tokens": 967618.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.8202653799758746,
+ "eval_entropy": 0.6505340352821886,
+ "eval_loss": 0.6226425170898438,
+ "eval_mean_token_accuracy": 0.8273337463314614,
+ "eval_num_tokens": 967618.0,
+ "eval_runtime": 55.2588,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 340
+ },
+ {
+ "entropy": 0.6730180442333221,
+ "epoch": 0.8685162846803377,
+ "grad_norm": 0.5941824316978455,
+ "learning_rate": 0.00020515069977181664,
+ "loss": 0.6342682361602783,
+ "mean_token_accuracy": 0.8241883024573327,
+ "num_tokens": 1018333.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.8685162846803377,
+ "eval_entropy": 0.6627941898415598,
+ "eval_loss": 0.6253094673156738,
+ "eval_mean_token_accuracy": 0.8266657035002548,
+ "eval_num_tokens": 1018333.0,
+ "eval_runtime": 55.2725,
+ "eval_samples_per_second": 25.691,
+ "eval_steps_per_second": 3.22,
+ "step": 360
+ },
+ {
+ "entropy": 0.6687290579080581,
+ "epoch": 0.916767189384801,
+ "grad_norm": 0.6753661036491394,
+ "learning_rate": 0.00021657970811565043,
+ "loss": 0.6109315872192382,
+ "mean_token_accuracy": 0.8278054997324944,
+ "num_tokens": 1069111.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.916767189384801,
+ "eval_entropy": 0.6449436111731476,
+ "eval_loss": 0.6159152984619141,
+ "eval_mean_token_accuracy": 0.8295929308017987,
+ "eval_num_tokens": 1069111.0,
+ "eval_runtime": 55.2434,
+ "eval_samples_per_second": 25.704,
+ "eval_steps_per_second": 3.222,
+ "step": 380
+ },
+ {
+ "entropy": 0.6549551770091057,
+ "epoch": 0.9650180940892642,
+ "grad_norm": 0.6604854464530945,
+ "learning_rate": 0.00022800871645948422,
+ "loss": 0.6144959926605225,
+ "mean_token_accuracy": 0.8307996809482574,
+ "num_tokens": 1123008.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.9650180940892642,
+ "eval_entropy": 0.6285942687412326,
+ "eval_loss": 0.6156108975410461,
+ "eval_mean_token_accuracy": 0.8301522892512633,
+ "eval_num_tokens": 1123008.0,
+ "eval_runtime": 55.2964,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 400
+ },
+ {
+ "entropy": 0.6362387537956238,
+ "epoch": 1.0120627261761157,
+ "grad_norm": 0.6493216156959534,
+ "learning_rate": 0.00023715125200746723,
+ "loss": 0.5960193634033203,
+ "mean_token_accuracy": 0.831819243920155,
+ "num_tokens": 1177682.0,
+ "step": 420
+ },
+ {
+ "epoch": 1.0120627261761157,
+ "eval_entropy": 0.6311206628432434,
+ "eval_loss": 0.6114970445632935,
+ "eval_mean_token_accuracy": 0.8305217643802085,
+ "eval_num_tokens": 1177682.0,
+ "eval_runtime": 55.2941,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 420
+ },
+ {
+ "entropy": 0.6032533653080463,
+ "epoch": 1.060313630880579,
+ "grad_norm": 0.5768770575523376,
+ "learning_rate": 0.0002371277633572037,
+ "loss": 0.570250129699707,
+ "mean_token_accuracy": 0.8371127635240555,
+ "num_tokens": 1238001.0,
+ "step": 440
+ },
+ {
+ "epoch": 1.060313630880579,
+ "eval_entropy": 0.5940044235982253,
+ "eval_loss": 0.6010516285896301,
+ "eval_mean_token_accuracy": 0.8339078124989284,
+ "eval_num_tokens": 1238001.0,
+ "eval_runtime": 55.3141,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 440
+ },
+ {
+ "entropy": 0.6125824645161628,
+ "epoch": 1.1085645355850422,
+ "grad_norm": 0.5974647402763367,
+ "learning_rate": 0.00023707072594936633,
+ "loss": 0.5693985462188721,
+ "mean_token_accuracy": 0.8372680127620697,
+ "num_tokens": 1294494.0,
+ "step": 460
+ },
+ {
+ "epoch": 1.1085645355850422,
+ "eval_entropy": 0.6013931218492851,
+ "eval_loss": 0.6000593900680542,
+ "eval_mean_token_accuracy": 0.8335993320084689,
+ "eval_num_tokens": 1294494.0,
+ "eval_runtime": 55.3351,
+ "eval_samples_per_second": 25.662,
+ "eval_steps_per_second": 3.217,
+ "step": 460
+ },
+ {
+ "entropy": 0.6020087823271751,
+ "epoch": 1.1568154402895054,
+ "grad_norm": 0.5234053134918213,
+ "learning_rate": 0.00023698015592486674,
+ "loss": 0.5627901554107666,
+ "mean_token_accuracy": 0.8367442533373832,
+ "num_tokens": 1352928.0,
+ "step": 480
+ },
+ {
+ "epoch": 1.1568154402895054,
+ "eval_entropy": 0.6235448820202538,
+ "eval_loss": 0.5950364470481873,
+ "eval_mean_token_accuracy": 0.8320885056190277,
+ "eval_num_tokens": 1352928.0,
+ "eval_runtime": 55.2811,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 480
+ },
+ {
+ "entropy": 0.6143898621201516,
+ "epoch": 1.2050663449939687,
+ "grad_norm": 0.606967031955719,
+ "learning_rate": 0.00023685607891395062,
+ "loss": 0.5775506019592285,
+ "mean_token_accuracy": 0.8369954064488411,
+ "num_tokens": 1406356.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.2050663449939687,
+ "eval_entropy": 0.6467455027813322,
+ "eval_loss": 0.5900489091873169,
+ "eval_mean_token_accuracy": 0.8357991258080086,
+ "eval_num_tokens": 1406356.0,
+ "eval_runtime": 55.2709,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 500
+ },
+ {
+ "entropy": 0.6138403750956059,
+ "epoch": 1.2533172496984317,
+ "grad_norm": 0.6079438328742981,
+ "learning_rate": 0.00023669853002894432,
+ "loss": 0.5609864711761474,
+ "mean_token_accuracy": 0.8403474077582359,
+ "num_tokens": 1464150.0,
+ "step": 520
+ },
+ {
+ "epoch": 1.2533172496984317,
+ "eval_entropy": 0.6021975442934572,
+ "eval_loss": 0.5859882831573486,
+ "eval_mean_token_accuracy": 0.8359727501199486,
+ "eval_num_tokens": 1464150.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 520
+ },
+ {
+ "entropy": 0.6103575885295868,
+ "epoch": 1.301568154402895,
+ "grad_norm": 0.576837420463562,
+ "learning_rate": 0.00023650755385431872,
+ "loss": 0.5683893680572509,
+ "mean_token_accuracy": 0.8384664133191109,
+ "num_tokens": 1521444.0,
+ "step": 540
+ },
+ {
+ "epoch": 1.301568154402895,
+ "eval_entropy": 0.5911824041872882,
+ "eval_loss": 0.5858258605003357,
+ "eval_mean_token_accuracy": 0.8374535170164001,
+ "eval_num_tokens": 1521444.0,
+ "eval_runtime": 55.2523,
+ "eval_samples_per_second": 25.7,
+ "eval_steps_per_second": 3.222,
+ "step": 540
+ },
+ {
+ "entropy": 0.6077366039156914,
+ "epoch": 1.3498190591073582,
+ "grad_norm": 0.5415759086608887,
+ "learning_rate": 0.00023628320443407213,
+ "loss": 0.563680362701416,
+ "mean_token_accuracy": 0.8382868468761444,
+ "num_tokens": 1576872.0,
+ "step": 560
+ },
+ {
+ "epoch": 1.3498190591073582,
+ "eval_entropy": 0.6217049782195788,
+ "eval_loss": 0.579742431640625,
+ "eval_mean_token_accuracy": 0.8369964655865444,
+ "eval_num_tokens": 1576872.0,
+ "eval_runtime": 55.2863,
+ "eval_samples_per_second": 25.684,
+ "eval_steps_per_second": 3.22,
+ "step": 560
+ },
+ {
+ "entropy": 0.5980370678007603,
+ "epoch": 1.3980699638118215,
+ "grad_norm": 0.5197780132293701,
+ "learning_rate": 0.00023602554525643677,
+ "loss": 0.5522702217102051,
+ "mean_token_accuracy": 0.8421810269355774,
+ "num_tokens": 1627009.0,
+ "step": 580
+ },
+ {
+ "epoch": 1.3980699638118215,
+ "eval_entropy": 0.6227023576417666,
+ "eval_loss": 0.5810565948486328,
+ "eval_mean_token_accuracy": 0.838038090909465,
+ "eval_num_tokens": 1627009.0,
+ "eval_runtime": 55.2757,
+ "eval_samples_per_second": 25.689,
+ "eval_steps_per_second": 3.22,
+ "step": 580
+ },
+ {
+ "entropy": 0.6068567186594009,
+ "epoch": 1.4463208685162847,
+ "grad_norm": 0.5295549035072327,
+ "learning_rate": 0.00023573464923591205,
+ "loss": 0.5554513931274414,
+ "mean_token_accuracy": 0.8414245262742043,
+ "num_tokens": 1682323.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.4463208685162847,
+ "eval_entropy": 0.6000609700934271,
+ "eval_loss": 0.5768566131591797,
+ "eval_mean_token_accuracy": 0.839005763611097,
+ "eval_num_tokens": 1682323.0,
+ "eval_runtime": 55.2665,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 600
+ },
+ {
+ "entropy": 0.5905646570026875,
+ "epoch": 1.494571773220748,
+ "grad_norm": 0.5507094264030457,
+ "learning_rate": 0.00023541059869263081,
+ "loss": 0.5510271549224853,
+ "mean_token_accuracy": 0.8407903507351875,
+ "num_tokens": 1738743.0,
+ "step": 620
+ },
+ {
+ "epoch": 1.494571773220748,
+ "eval_entropy": 0.6138454079627991,
+ "eval_loss": 0.5707022547721863,
+ "eval_mean_token_accuracy": 0.8407475007384011,
+ "eval_num_tokens": 1738743.0,
+ "eval_runtime": 55.2612,
+ "eval_samples_per_second": 25.696,
+ "eval_steps_per_second": 3.221,
+ "step": 620
+ },
+ {
+ "entropy": 0.5900103107094765,
+ "epoch": 1.5428226779252112,
+ "grad_norm": 0.5121804475784302,
+ "learning_rate": 0.00023505348532906368,
+ "loss": 0.5467266082763672,
+ "mean_token_accuracy": 0.8411859899759293,
+ "num_tokens": 1794501.0,
+ "step": 640
+ },
+ {
+ "epoch": 1.5428226779252112,
+ "eval_entropy": 0.6089411131786496,
+ "eval_loss": 0.5724870562553406,
+ "eval_mean_token_accuracy": 0.8386286386613095,
+ "eval_num_tokens": 1794501.0,
+ "eval_runtime": 55.265,
+ "eval_samples_per_second": 25.694,
+ "eval_steps_per_second": 3.221,
+ "step": 640
+ },
+ {
+ "entropy": 0.5756346069276332,
+ "epoch": 1.5910735826296745,
+ "grad_norm": 0.5472006797790527,
+ "learning_rate": 0.00023466341020406828,
+ "loss": 0.5396484375,
+ "mean_token_accuracy": 0.8448511779308319,
+ "num_tokens": 1850301.0,
+ "step": 660
+ },
+ {
+ "epoch": 1.5910735826296745,
+ "eval_entropy": 0.5886324905277638,
+ "eval_loss": 0.5662708282470703,
+ "eval_mean_token_accuracy": 0.8412756119551283,
+ "eval_num_tokens": 1850301.0,
+ "eval_runtime": 55.2744,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 660
+ },
+ {
+ "entropy": 0.5833481803536416,
+ "epoch": 1.6393244873341375,
+ "grad_norm": 0.47462818026542664,
+ "learning_rate": 0.0002342404837042908,
+ "loss": 0.5445642471313477,
+ "mean_token_accuracy": 0.8443691149353981,
+ "num_tokens": 1905390.0,
+ "step": 680
+ },
+ {
+ "epoch": 1.6393244873341375,
+ "eval_entropy": 0.5821995529231061,
+ "eval_loss": 0.5642583966255188,
+ "eval_mean_token_accuracy": 0.8412673570466845,
+ "eval_num_tokens": 1905390.0,
+ "eval_runtime": 55.2999,
+ "eval_samples_per_second": 25.678,
+ "eval_steps_per_second": 3.219,
+ "step": 680
+ },
+ {
+ "entropy": 0.5866442531347275,
+ "epoch": 1.6875753920386007,
+ "grad_norm": 0.4118373990058899,
+ "learning_rate": 0.00023378482551292802,
+ "loss": 0.5519282341003418,
+ "mean_token_accuracy": 0.8430693671107292,
+ "num_tokens": 1963425.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.6875753920386007,
+ "eval_entropy": 0.5955309136195129,
+ "eval_loss": 0.5613821744918823,
+ "eval_mean_token_accuracy": 0.8411754523770193,
+ "eval_num_tokens": 1963425.0,
+ "eval_runtime": 55.2957,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 700
+ },
+ {
+ "entropy": 0.5837626487016678,
+ "epoch": 1.7358262967430638,
+ "grad_norm": 0.41595396399497986,
+ "learning_rate": 0.00023329656457585815,
+ "loss": 0.5490932464599609,
+ "mean_token_accuracy": 0.8433916479349136,
+ "num_tokens": 2024685.0,
+ "step": 720
+ },
+ {
+ "epoch": 1.7358262967430638,
+ "eval_entropy": 0.5821482105536407,
+ "eval_loss": 0.558079719543457,
+ "eval_mean_token_accuracy": 0.8422878351104394,
+ "eval_num_tokens": 2024685.0,
+ "eval_runtime": 55.2982,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 720
+ },
+ {
+ "entropy": 0.5775028631091118,
+ "epoch": 1.784077201447527,
+ "grad_norm": 0.5807880759239197,
+ "learning_rate": 0.00023277583906515078,
+ "loss": 0.5457483768463135,
+ "mean_token_accuracy": 0.8433813333511353,
+ "num_tokens": 2083327.0,
+ "step": 740
+ },
+ {
+ "epoch": 1.784077201447527,
+ "eval_entropy": 0.5907773069116506,
+ "eval_loss": 0.5575461387634277,
+ "eval_mean_token_accuracy": 0.8427480385544595,
+ "eval_num_tokens": 2083327.0,
+ "eval_runtime": 55.2977,
+ "eval_samples_per_second": 25.679,
+ "eval_steps_per_second": 3.219,
+ "step": 740
+ },
+ {
+ "entropy": 0.5845035955309867,
+ "epoch": 1.8323281061519903,
+ "grad_norm": 0.7579953670501709,
+ "learning_rate": 0.0002322227963399659,
+ "loss": 0.534868860244751,
+ "mean_token_accuracy": 0.8444704949855805,
+ "num_tokens": 2136500.0,
+ "step": 760
+ },
+ {
+ "epoch": 1.8323281061519903,
+ "eval_entropy": 0.5573678276177203,
+ "eval_loss": 0.5549466609954834,
+ "eval_mean_token_accuracy": 0.8444588984666246,
+ "eval_num_tokens": 2136500.0,
+ "eval_runtime": 55.2935,
+ "eval_samples_per_second": 25.681,
+ "eval_steps_per_second": 3.219,
+ "step": 760
+ },
+ {
+ "entropy": 0.5598218090832233,
+ "epoch": 1.8805790108564535,
+ "grad_norm": 0.48678866028785706,
+ "learning_rate": 0.00023163759290485277,
+ "loss": 0.5248189449310303,
+ "mean_token_accuracy": 0.8493345126509666,
+ "num_tokens": 2192762.0,
+ "step": 780
+ },
+ {
+ "epoch": 1.8805790108564535,
+ "eval_entropy": 0.571697450923116,
+ "eval_loss": 0.553294837474823,
+ "eval_mean_token_accuracy": 0.8445497747887386,
+ "eval_num_tokens": 2192762.0,
+ "eval_runtime": 55.3123,
+ "eval_samples_per_second": 25.672,
+ "eval_steps_per_second": 3.218,
+ "step": 780
+ },
+ {
+ "entropy": 0.5655641779303551,
+ "epoch": 1.9288299155609168,
+ "grad_norm": 0.46138879656791687,
+ "learning_rate": 0.0002310203943654614,
+ "loss": 0.5277577877044678,
+ "mean_token_accuracy": 0.8480132848024369,
+ "num_tokens": 2249198.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.9288299155609168,
+ "eval_entropy": 0.5671831344285708,
+ "eval_loss": 0.549659788608551,
+ "eval_mean_token_accuracy": 0.8456521740790164,
+ "eval_num_tokens": 2249198.0,
+ "eval_runtime": 55.2817,
+ "eval_samples_per_second": 25.687,
+ "eval_steps_per_second": 3.22,
+ "step": 800
+ },
+ {
+ "entropy": 0.5706607647240162,
+ "epoch": 1.97708082026538,
+ "grad_norm": 0.5946080088615417,
+ "learning_rate": 0.00023037137538167756,
+ "loss": 0.5285571098327637,
+ "mean_token_accuracy": 0.8492968618869782,
+ "num_tokens": 2302801.0,
+ "step": 820
+ },
+ {
+ "epoch": 1.97708082026538,
+ "eval_entropy": 0.5583421492509628,
+ "eval_loss": 0.5508614778518677,
+ "eval_mean_token_accuracy": 0.8449724641408813,
+ "eval_num_tokens": 2302801.0,
+ "eval_runtime": 55.2922,
+ "eval_samples_per_second": 25.682,
+ "eval_steps_per_second": 3.219,
+ "step": 820
+ },
+ {
+ "entropy": 0.5352686597750738,
+ "epoch": 2.0241254523522314,
+ "grad_norm": 0.517436146736145,
+ "learning_rate": 0.00022969071961819653,
+ "loss": 0.4967633247375488,
+ "mean_token_accuracy": 0.8539289052669818,
+ "num_tokens": 2358124.0,
+ "step": 840
+ },
+ {
+ "epoch": 2.0241254523522314,
+ "eval_entropy": 0.5460858092214285,
+ "eval_loss": 0.5541515946388245,
+ "eval_mean_token_accuracy": 0.8444636634896311,
+ "eval_num_tokens": 2358124.0,
+ "eval_runtime": 55.2792,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 840
+ },
+ {
+ "entropy": 0.5349089443683624,
+ "epoch": 2.0723763570566947,
+ "grad_norm": 0.48389914631843567,
+ "learning_rate": 0.00022897861969254802,
+ "loss": 0.4880528450012207,
+ "mean_token_accuracy": 0.8565173164010048,
+ "num_tokens": 2415855.0,
+ "step": 860
+ },
+ {
+ "epoch": 2.0723763570566947,
+ "eval_entropy": 0.5478990998830688,
+ "eval_loss": 0.5504991412162781,
+ "eval_mean_token_accuracy": 0.8451762336693452,
+ "eval_num_tokens": 2415855.0,
+ "eval_runtime": 55.274,
+ "eval_samples_per_second": 25.69,
+ "eval_steps_per_second": 3.22,
+ "step": 860
+ },
+ {
+ "entropy": 0.5228772208094596,
+ "epoch": 2.120627261761158,
+ "grad_norm": 0.6992365121841431,
+ "learning_rate": 0.00022823527712058763,
+ "loss": 0.47493491172790525,
+ "mean_token_accuracy": 0.8592865198850632,
+ "num_tokens": 2470986.0,
+ "step": 880
+ },
+ {
+ "epoch": 2.120627261761158,
+ "eval_entropy": 0.5613854529147737,
+ "eval_loss": 0.5500661730766296,
+ "eval_mean_token_accuracy": 0.8454208903098375,
+ "eval_num_tokens": 2470986.0,
+ "eval_runtime": 55.2832,
+ "eval_samples_per_second": 25.686,
+ "eval_steps_per_second": 3.22,
+ "step": 880
+ },
+ {
+ "entropy": 0.5406626127660275,
+ "epoch": 2.168878166465621,
+ "grad_norm": 0.5532678365707397,
+ "learning_rate": 0.00022746090225947036,
+ "loss": 0.49279079437255857,
+ "mean_token_accuracy": 0.8531969726085663,
+ "num_tokens": 2531414.0,
+ "step": 900
+ },
+ {
+ "epoch": 2.168878166465621,
+ "eval_entropy": 0.5460681235522367,
+ "eval_loss": 0.5487214922904968,
+ "eval_mean_token_accuracy": 0.846246014820056,
+ "eval_num_tokens": 2531414.0,
+ "eval_runtime": 55.2597,
+ "eval_samples_per_second": 25.697,
+ "eval_steps_per_second": 3.221,
+ "step": 900
+ },
+ {
+ "entropy": 0.5183610402047634,
+ "epoch": 2.2171290711700844,
+ "grad_norm": 0.4878793954849243,
+ "learning_rate": 0.0002266557142481219,
+ "loss": 0.47671008110046387,
+ "mean_token_accuracy": 0.8574348524212837,
+ "num_tokens": 2593991.0,
+ "step": 920
+ },
+ {
+ "epoch": 2.2171290711700844,
+ "eval_entropy": 0.5519453509804908,
+ "eval_loss": 0.5485875606536865,
+ "eval_mean_token_accuracy": 0.8460252378763777,
+ "eval_num_tokens": 2593991.0,
+ "eval_runtime": 55.2689,
+ "eval_samples_per_second": 25.693,
+ "eval_steps_per_second": 3.221,
+ "step": 920
+ },
+ {
+ "entropy": 0.5451609842479229,
+ "epoch": 2.2653799758745476,
+ "grad_norm": 0.48667117953300476,
+ "learning_rate": 0.00022581994094522502,
+ "loss": 0.4859492301940918,
+ "mean_token_accuracy": 0.8560267508029937,
+ "num_tokens": 2643389.0,
+ "step": 940
+ },
+ {
+ "epoch": 2.2653799758745476,
+ "eval_entropy": 0.535621813341473,
+ "eval_loss": 0.5473203063011169,
+ "eval_mean_token_accuracy": 0.8467726958601662,
+ "eval_num_tokens": 2643389.0,
+ "eval_runtime": 55.2696,
+ "eval_samples_per_second": 25.692,
+ "eval_steps_per_second": 3.221,
+ "step": 940
+ },
+ {
+ "entropy": 0.5411449268460273,
+ "epoch": 2.313630880579011,
+ "grad_norm": 0.415181964635849,
+ "learning_rate": 0.0002249538188647382,
+ "loss": 0.49844727516174314,
+ "mean_token_accuracy": 0.8529795065522194,
+ "num_tokens": 2697757.0,
+ "step": 960
+ },
+ {
+ "epoch": 2.313630880579011,
+ "eval_entropy": 0.5490157793412048,
+ "eval_loss": 0.5466434955596924,
+ "eval_mean_token_accuracy": 0.8461743238266934,
+ "eval_num_tokens": 2697757.0,
+ "eval_runtime": 55.2795,
+ "eval_samples_per_second": 25.688,
+ "eval_steps_per_second": 3.22,
+ "step": 960
+ },
+ {
+ "entropy": 0.5334192402660847,
+ "epoch": 2.361881785283474,
+ "grad_norm": 0.4685683250427246,
+ "learning_rate": 0.000224057593108965,
+ "loss": 0.4855259895324707,
+ "mean_token_accuracy": 0.8572103619575501,
+ "num_tokens": 2752288.0,
+ "step": 980
+ },
+ {
+ "epoch": 2.361881785283474,
+ "eval_entropy": 0.5256538374370403,
+ "eval_loss": 0.5443126559257507,
+ "eval_mean_token_accuracy": 0.8462888333904609,
+ "eval_num_tokens": 2752288.0,
+ "eval_runtime": 55.2968,
+ "eval_samples_per_second": 25.68,
+ "eval_steps_per_second": 3.219,
+ "step": 980
+ }
+ ],
+ "logging_steps": 20,
+ "max_steps": 4150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 20,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.4843883414692352e+17,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1020/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1040/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1060/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1080/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1120/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1140/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1160/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1180/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-120/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1220/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1240/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1260/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1280/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1300/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1320/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1340/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1360/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1380/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-140/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1420/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1440/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1460/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1480/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1520/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1540/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/tokenizer.json b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/training_args.bin b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..508542353959119026bbc909537da99ec4f96c80
--- /dev/null
+++ b/overgeneralisation_original_Swedish/Qwen3-4B-Base_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test2/checkpoint-1560/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55912772fc712471b50a2b4ec99ff4f5525649a1f3027d3286aa8ab362407696
+size 6033
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b0c90d4dbe31bb6a72678e7a58829936e1617019
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cbc0465e8a3e31834d5581e6d5ad0111152eeeff8261ecf97477bc6560ad49a0
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1266/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5bda4f8538992e387e081475628655eb37f197fb
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98f955c8b28f525019d2314875439600a23046153ef0406f189759423adb2369
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1688/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3beac069785c50f17db47944a931cb0bf16039cf
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2da87972606715aa14afa74bbb643282759cd29a58c8daf1d99c5c2a428eb740
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2110/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a12eba43304212f561a144c6f9474d5cf17f83ca
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2e4095c03117a779057d4dad387e1f64fde6c43b0dc6703dc47091a6dad4984
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2532/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2bed5e406632ba311db684d4f21f1b9f9dac942d
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df77e14f8eae9e8919861d0bd3fe77c7a7c2801b84cef67a910a8c9d4d5a3f9a
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2954/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8f3ef2251dcf97eb864a974a0a186c4fbdf77439
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ac17bf5a18a3ab69e42da7dcb73ad094de4c08046a089e3ab3b043bfea9a7a2
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3376/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4722b7dc87a71d46cacffc47b9b4f6a6ce6b5f82
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e2ff0278a7e4a91bc38e5aa28c47097a32de7a907d9019095409330a4bfaaf4
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3798/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2ea15347bed9dd0961cb18242b8e9db86f300718
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02b485400e995a42cb23ef66e33c7a12b156d4b4374e4aeac310474ae875c846
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-422/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..669f07e27c1a2a2ce525bfc9b0ec6ce399987e88
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb6dfd27d1eb005e985b111f3fb3242204aeee1ce3ea2dfb79de157e5dc509cc
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4220/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ed21afcb8026f0f385c8f859fcaf9f6af359532d
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f6cba1f2575e88bbf364498e5cf0dd15841b60860b350dad3aef4f9be3a64cf
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..99efd2efd190923f6b26ead2efb08510045ea154
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-844/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95177719630e08936790f1adeea112eef415eded432f4d94d97b1825873aee7c
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b98227c79a177ecbd1ceaae2a71dde1c990af409
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f2f39d2ae0f5e794676bdddadd58765bc0813b4069e515cb2476aa18eba4996
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1266/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1ad1f555b029ef4c1e0a6d9335a17d3170592384
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4804e08ee28a11ff0b5d7b95733e4feb359a6801a1d9bd972df3f8c506c3f37
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1688/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..944de5a919a1cae75daafaab5d0a8b978c19a711
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c77c26855b7a49b60e247e7a3ecb8e3d704b197262872f15c9facbca0cb227b1
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2110/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f742b23bacd8f72d8be841693b1b1df3ff318b12
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c401c5fba74635a8b3a64d2ec7f449afcbf1b7b94689c7cc2ecba9f1f636d241
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2532/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4dc51e5628ae4eb01a4f0ec237ea51060ca97c88
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3af25f716a51726569c06809284291bd27d42db4003364d44bebbf3a1ce55342
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2954/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5232ac8f61379fa08a0714376988404528e95906
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbef709b106a321383a66ec14cc970cdf7eb2b7b748bfe3905e31524c09a4eef
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3376/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7ff63a320f84be99df499f50274ce5f2c81b3534
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c54cf6e72e7c873d53b7617c915432ac66f7c63777f889def5c57bfbf1e6a074
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3798/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..12ec2005856a8c953b1e6f82e4ed2a5a0717649e
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7311f2d0bb6d2db2650faf6cc04b7562a8391ee347739da749527b56b04eaf99
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-422/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8d85ba9879984d9921806478f46c82e3138f0c2d
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09621bae85f1c6494b2b3b15c3b968315f14bc41e256567b729af19c7e3f5cb6
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4220/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb
+size 5969
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/adapter_model.safetensors b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0bf5570f0cd7ca65bac64058da3a08cb28823e82
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbc05d8fe4ac51af63319fef49b05f2bb4f32a2a8108268d5425a7588c7175e3
+size 1057033224
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/tokenizer.json b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/training_args.bin b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20ebe0d32e9bd2cc668073dacd5148ca89c78625
--- /dev/null
+++ b/productivity_code_Estonian/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-844/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df2bed10826027e1bb1376eeab6208c1e472c43974a3c94fd2b666987533ffb
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ce428b1631fad3e767ca430b23d1ef43b10e3864
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:486dd29b5f6b93bc43f1b9af9e7ce4c81dffa071f4f1f907f7a0b40c2545430a
+size 132187888
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1248/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1f3a322cc4eb0510ff5cb9385d917b76db1fdbac
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5d17e8a5e5422b0e365731f316a518dc79694bf3de26efd8b296dbbc2b69303
+size 132187888
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-1664/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..db3ee55611341a6ba978d64f2a898efe9ef997c3
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3033dcdcb0e9135c8bca97563001e0cd476b2a6874e7dd3ff1c0e87890bbeef2
+size 132187888
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2080/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..27e5bcdb1df00b2d6aa40ebf240a0526990af029
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b268ec0ab1ded0b1cf807843d427c8ca1d5a58f99923a1fb2a0f19be90df959
+size 132187888
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2496/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8342c0cd7b8ce0631547c5665e27e479b6fb7427
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b2247313954fdcc3dc52da901c91df46323a60acd30537e46732ac047a8d16c
+size 132187888
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-2912/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a7b7d0b4134f7667b3c6860707ab93b3c361ea9b
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b4893e6f0d53e4fbbd76835ae9175a4db7d918e3f80d8ca44eb542f184a7a40
+size 132187888
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3328/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..294b11b71a4b9bc2166046d8fa375b8f26e74353
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2abe2c724d7af7d15cc652e2fe530354df64dde50c0ccadd9ea4b52ec955b403
+size 132187888
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-3744/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7d7163298fb9416526777e6474aa70d9b41196e5
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5baeb8ce8698b4f981f6cd28c7d7b942e750f35955dcff00f87d8ea1b95c585d
+size 132187888
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-416/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a9cd7f5f9852409c134823588e5106c68f5fa8ca
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee2cea8460aa29bb1d0932624b31c8e7abd00416e2208ae9eb5aea1960598d24
+size 132187888
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-4160/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..44efa3031afe66ca6b0d9c82ac658f6780cab2e1
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d5e28929beff3fa99ab1fb1ac093702e413931937da1b2d6d0c878db5c41344
+size 132187888
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fafc70b589d6e0520076fdac27e04be46598a54
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test1/checkpoint-832/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da0a7d9c0cf6292257087e32deb7b0d08b81271ded4a2b606e806480fba0a89f
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9cfa2a6ff3ce788cf9c12c4afb88b9ca2c94e46c
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b7e4987b1586d1a55ed2276403a9b869985bf94f8939bd6dea92f7bd2605213
+size 528550256
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1248/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..399f317e9848f7b5acb368616ebc3e99d6a47eae
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8b8fa125ed5440c1168e74ce33d492de5d53155fb6cb4d1f6d649b4f24820ba
+size 528550256
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-1664/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0d3c8b937722192299536f8f04a20c3c7f68fd72
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:833f2c0917396b417eacef3cacfeca122695b4e007faf162a6afb68d9e386b6d
+size 528550256
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2080/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f758a4f542b72427552557b7da496b6ef5756dec
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:121f4441d5798e30d8e650d939fa4da776bc46931d49b9447c93cb2caafaf8bf
+size 528550256
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2496/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..439786ef2da27e0e9b15c465aa14617be56782e5
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfefb8f1aee9a9f521cb74126896324b6b2afde5ebb138ca525d152c983922e2
+size 528550256
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-2912/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8840f8ee659fa7ec7889a6813bdcc0abf126654b
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a37dbf1e87deb6eacd3a4623e018cc0e710b7c195ab6a30244f3622ebecd0f4a
+size 528550256
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3328/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fe187f7bf47b0e7ceb93f608f0fe1cde05dba5c8
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c2f4653542268dfc392b93cc5750cc210e0f330fae012b9c28d96661c295a66
+size 528550256
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-3744/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1f48becf1d9f6c9a810560f2a41e354e24fba91b
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ed6a3a449129e9cf5dde5daee7bbd7c69fb87e22052ca65164937249e0bdb7b
+size 528550256
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-416/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..54c585e94330c0d02094878cbc1a87538dc3df7d
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43a7fd8a735846be654071534beccd5cc59a8b91db968965935159995596b259
+size 528550256
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-4160/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb
+size 5969
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/adapter_model.safetensors b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9384b871b0f4e27005ba84dcc562dae49e931da5
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2aa468c76942d264d6265b2d7518c5460f4a63f0e0191383485a46672021d23
+size 528550256
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/tokenizer.json b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/training_args.bin b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a440f50784e9505c9d6e4a015cefc759796ec4d7
--- /dev/null
+++ b/productivity_code_Swedish/Qwen3-4B-Base_productivity_splits_code_features_train_productivity_splits_code_features_test2/checkpoint-832/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3dd5808b5910ef225f7fefb4fa29294ee03c7397c1856f7855760414821deb
+size 5969
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8a3ef83e45551fd44e02c30a466fa4d16b6d3081
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a305489717ffb7f09694d37800511fc8105e970d422da845485beacda648e805
+size 528550256
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1266/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3c6c5929b165d6927d7fb9f203e47dddc2d9e39d
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed3ca7e462a6f9dda89543c44e3df8eb6630e9c9a8810febe416f1b53c6647cb
+size 528550256
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-1688/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..80a999bf5498d0528936e4386d47dfac9d9e86c7
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:871529db5fc0121f990b9ba5cb1b81d4bd577f9777c45a7425aa3f5ce97e22d2
+size 528550256
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2110/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..59a6aad946a782e9cf92d5d31edc7de20ac66052
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13b70c2d156f9524059ddf23c74ab1c8fde7f7989e916d2c9dab9896a858021e
+size 528550256
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2532/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4a4880529704a423c0317f524419250a207f39ee
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b5410a1e2bfe1f4f1b1285f558381e41ed44c5270eac59d2a712747a7e674e9
+size 528550256
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-2954/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..98b34d771c7d1a01201d034df2531109334e3730
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de14455a0c67a5b03c277d24e9cf24137c1e5d163329d0de56c0073cc7824d19
+size 528550256
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3376/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2adee9fff9a2d2bdd621c45ef01b49834f23169f
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1b86e6e53406383857bceb893a71ab43d661371de4cab6c55af683ecd39d614
+size 528550256
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3798/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6d303f053aa87b837b1f54ebce0a95f982ead023
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b2fb11013528d4f89f479d0314835588ce1134e07a7c0ddf5969097d854685c
+size 528550256
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-422/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..69641f99c90f4d3650c29d4ea9157c96d1379445
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff2b37f0486dc225ee19bb58233340c01582cbe6a0e6b757c92fda732d488340
+size 528550256
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4220/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fcf9438fccfcef51075f1ab11261959d8954c1ca
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df7b78b191b65ba9bfda9f8ac4e94553d23e873c77250c78c5a3a5a33304292
+size 528550256
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a14a967df058442729af6156a5dd1378874b039
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-844/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bd6716430261876870f9d8a9a13c030ff118c36e6dde661af5760e2fb3ae358
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1266/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1266/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8f0ec3dc9c343a2c369c5c7897b2025dd0c3de5d
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1266/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf1b946ce8888b60617677c221d4f9345a1a90f299d8c81589a4297328f6f9cb
+size 1057033224
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1891d78d680487f3fcd81617a9be4094a5dd444c
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:432d4c151b4f2e819fe75b03bac489ef4066f67461c3b62362523ff204cd5cfd
+size 1057033224
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1688/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..415e93313a151c45f73744a8f85628bf6fd95fdc
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1769a198912e748ca0e5ea0bf306afdfd5f6f6edf49959437427c0403dba1660
+size 1057033224
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2110/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7d5b245889666232789df267ce760e730f79cff4
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b828533887ff4c3775d0145ce96efa3497149a3196b06745d6992bc7888ddcf
+size 1057033224
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2532/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..64b45a7631a0d9c38948303621f42915744e5895
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:541c432be66eca44eb5090d931bb8dec7d1b5e6434922a136b5fb8b069d35d2e
+size 1057033224
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2954/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d18863db62a6976a50722f029fa087e3bad82916
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b17b8c5ac151dc836de67b44b428c7202ac2fc49f6875f8782ce0ac0bab0141
+size 1057033224
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3376/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a8a8757633d0429a28785790a48928a4b1cade99
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08338e683f664fb43bd57f2f1ad9784286222c25cf329dd6287c832d35a88d2c
+size 1057033224
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3798/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..75580e83603d3bf49d57a0a1aab20eca05467fb6
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:060618a44d1a4d11107a689b1faec187ceba439f81351ea758bbcdd01f4efe5a
+size 1057033224
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-422/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..651313a886f88529351e27d6614f434288835c16
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a93bf8441a40523d9705d5a84931485ee313190390af6a79a3de7e0a8cc6ed29
+size 1057033224
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4220/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190
+size 6033
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/adapter_model.safetensors b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7f5cb096f41ccfe8e574d77577146f11a1493b01
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a29c3c29d49743bfbddbe6d728f7d24d152490aa110bc32ccc026195f0957c6
+size 1057033224
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/tokenizer.json b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/training_args.bin b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e4d1131b805c0250f1e517771932318a8cb06f57
--- /dev/null
+++ b/productivity_original_Estonian/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-844/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:239bcb1638f2f0880dabc8a510a3159ecd9462dda9028bdcd7cf5f3253b98190
+size 6033
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3744/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3744/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..eeb961260daab308690ee2424014d017c81948ff
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-3744/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92041ff3f6c36f2c0eba07422557339fa9c5291f3c862d1c259c5a20a2b5e617
+size 132187888
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-416/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-416/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..93f546b7ff4208301d1d56632f05d992da2f9787
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-416/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3282ecdf3e931422bb3dcf90b3ab7c327640022c3816e8176ad272aed74bbeb2
+size 132187888
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4160/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4160/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..98eb25e7492c2d265c9caac0cb59306a62752da5
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-4160/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fddf99881d0157cd7ce7286527f4f0b5f1497ded101b726a95b7030cc6361e0
+size 132187888
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-832/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-832/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1c88555eda53efba8e381a14444e761377fa6557
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test1/checkpoint-832/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9dd25ec4e3e2a4ba2a47a382979cc571755290d6e004dec9681e1f5e7f790260
+size 132187888
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1248/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1248/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1afb2db769ef6bc745ae9a969f35e592271b6853
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1248/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df79709ef7b8e9d0108929e9d6d9e6219fc0e71dc0bf461880764bea234c2802
+size 1057033224
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1664/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1664/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..917f9ba91cb6dc51e837459c71c41ef46d414c0d
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-1664/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4028d8243e88a1ed8653bb6f1ae72125cc50df50f3c00af5503aab4a9b8b03a
+size 1057033224
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2080/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2080/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0758a013282d084b219cb1a09d6fcdd981a805ef
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2080/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e1f59a4947a3ada4ccecd49140aa5614a14c1d2a4b7a385eac2abc7979be1d7
+size 1057033224
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2496/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2496/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a1f61f61ea41667466578cf2656c3aeabd0c94f2
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2496/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37ad97e8609611d2493dbcaf94b0111929ac576e7faf32a2b00ab35e2d239d64
+size 1057033224
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2912/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2912/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5946483d879dff2e6eb1e2cf4531552a299106b7
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-2912/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc9c5790f7c7e6eec0ce381b5528e905a301910027cc490f25212c1442e068f5
+size 1057033224
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3328/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3328/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..77cbd320654f1aceba60a6be42c69eaad316b078
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3328/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b84b12a897a4b643e8092814dbc2b91333786bcfc8c45e159d58a08aa3c67c06
+size 1057033224
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3744/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3744/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6f43794ef309cbd9b81142cd9f4608ee3d17f4ec
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-3744/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ee9e4e813a7a4f4b702573295829d04fd03f54f1b6cd2d77b31911491e2fb4d
+size 1057033224
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-416/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-416/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a3ca9f8d1072593bd648832cdd6495c4ccd28f99
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-416/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc523dd7dac2217bdd4e0cff500f687607e9a64260ae620fed7277afe1a33bae
+size 1057033224
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4160/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4160/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..84e5dc7c8501227ab609efd2c72304c05137b455
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-4160/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7647b1aa2182b24cd8ec9330cbd6107d057c584474b28660e5f8465847df3318
+size 1057033224
diff --git a/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-832/adapter_model.safetensors b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-832/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f71bd1034792b14cb52aec864e5d56320caac943
--- /dev/null
+++ b/productivity_original_Swedish/Qwen3-4B-Base_productivity_splits_original_features_train_productivity_splits_original_features_test2/checkpoint-832/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1e70d71bfb1a3bc125633e6457c56a6f9cf8fd76cf63d131555db20486985be
+size 1057033224